diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/ZooKeeperServer.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/ZooKeeperServer.java index c0963695899..2b2a5e5f569 100644 --- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/ZooKeeperServer.java +++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/ZooKeeperServer.java @@ -492,6 +492,28 @@ public void setZKDatabase(ZKDatabase zkDb) { * Restore sessions and data */ public void loadData() throws IOException, InterruptedException { + loadData(false); + } + + /** + * Restore sessions and data, optionally skipping the startup snapshot. + * + * During leader election, the synchronous snapshot in loadData() blocks + * quorum formation — on large ensembles (15M+ znodes) it takes 34-43s, + * which can exceed initLimit and cause repeated election failures. + * Skipping the snapshot is safe because: + * - Dead session cleanup (killSession) is idempotent on recovery + * - Follower sync uses the in-memory DataTree, not the disk snapshot + * - SyncRequestProcessor takes periodic snapshots after quorum forms + * - This matches the approach in ZOOKEEPER-1558 (branch-3.4, 2013) + * + * @param skipSnapshot if true, skip the startup snapshot. The periodic + * snapshot mechanism in SyncRequestProcessor will persist state + * after quorum is established and transactions begin flowing. + * @see ZOOKEEPER-1558 + * @see ZOOKEEPER-4766 + */ + public void loadData(boolean skipSnapshot) throws IOException, InterruptedException { /* * When a new leader starts executing Leader#lead, it * invokes this method. The database, however, has been @@ -529,8 +551,15 @@ public void loadData() throws IOException, InterruptedException { killSession(session, zkDb.getDataTreeLastProcessedZxid()); } - // Make a clean snapshot - takeSnapshot(); + if (skipSnapshot) { + LOG.info("Skipping startup snapshot (periodic snapshot will persist state). " + + "lastProcessedZxid: 0x{}, dead sessions cleaned: {}", + Long.toHexString(zkDb.getDataTreeLastProcessedZxid()), + deadSessions.size()); + } else { + // Make a clean snapshot + takeSnapshot(); + } } public void takeSnapshot() { diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Leader.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Leader.java index e90df8f67a1..7677b99dc63 100644 --- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Leader.java +++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Leader.java @@ -587,7 +587,7 @@ void lead() throws IOException, InterruptedException { try { self.setZabState(QuorumPeer.ZabState.DISCOVERY); self.tick.set(0); - zk.loadData(); + zk.loadData(self.isSkipLeaderStartupSnapshot()); leaderStateSummary = new StateSummary(self.getCurrentEpoch(), zk.getLastProcessedZxid()); diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java index 17705ef7b75..2f7a22a6ef3 100644 --- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java +++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java @@ -720,6 +720,18 @@ public synchronized void setCurrentVote(Vote v) { */ protected volatile int initLimit; + /** + * Whether to skip the synchronous snapshot during leader startup. + * On large ensembles (15M+ znodes), the snapshot in loadData() takes + * 34-43s, blocking quorum formation and causing repeated election + * failures when it exceeds initLimit. Skipping it is safe — see + * ZOOKEEPER-1558 and ZOOKEEPER-4766. + */ + public static final String SKIP_LEADER_STARTUP_SNAPSHOT = + "zookeeper.leaderElection.skipStartupSnapshot"; + private volatile boolean skipLeaderStartupSnapshot = + Boolean.getBoolean(SKIP_LEADER_STARTUP_SNAPSHOT); + /** * The number of ticks that can pass between sending a request and getting * an acknowledgment @@ -1819,6 +1831,14 @@ public void setInitLimit(int initLimit) { this.initLimit = initLimit; } + public boolean isSkipLeaderStartupSnapshot() { + return skipLeaderStartupSnapshot; + } + + public void setSkipLeaderStartupSnapshot(boolean skip) { + this.skipLeaderStartupSnapshot = skip; + } + /** * Get the current tick */ diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/ZooKeeperServerTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/ZooKeeperServerTest.java index 81469a8f858..5956841fcfd 100644 --- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/ZooKeeperServerTest.java +++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/ZooKeeperServerTest.java @@ -26,6 +26,7 @@ import java.util.List; import org.apache.zookeeper.ZKTestCase; import org.apache.zookeeper.server.persistence.FileTxnLog; +import org.apache.zookeeper.server.persistence.FileTxnSnapLog; import org.apache.zookeeper.server.persistence.SnapStream; import org.apache.zookeeper.server.persistence.Util; import org.apache.zookeeper.test.ClientBase; @@ -135,4 +136,111 @@ public void testInvalidSnapshot() { } } + /** + * Helper: create a ZooKeeperServer with an initialized database. + * This simulates the state during leader election where the DB + * is already loaded (QuorumPeer.start() loads it before election). + */ + private ZooKeeperServer createServerWithInitializedDb(File tmpDir) throws Exception { + FileTxnSnapLog snapLog = new FileTxnSnapLog(tmpDir, tmpDir); + ZKDatabase zkDb = new ZKDatabase(snapLog); + // Initialize DB (as QuorumPeer.start() would do before election) + zkDb.loadDataBase(); + ZooKeeperServer zks = new ZooKeeperServer(snapLog, 2000, null); + zks.setZKDatabase(zkDb); + return zks; + } + + private static int countSnapshotFiles(File dir) { + if (dir == null || !dir.exists()) { + return 0; + } + File[] snaps = dir.listFiles((d, name) -> name.startsWith("snapshot.")); + return snaps == null ? 0 : snaps.length; + } + + /** + * Test that loadData(false) takes a snapshot (default behavior). + * We detect this by checking that the snapshot file's modification + * time is updated (the file name stays the same since zxid is unchanged). + */ + @Test + public void testLoadDataTakesSnapshotByDefault() throws Exception { + File tmpDir = ClientBase.createTmpDir(); + try { + ZooKeeperServer zks = createServerWithInitializedDb(tmpDir); + File snapDir = new File(tmpDir, "version-2"); + long lastModBefore = getLatestSnapshotModTime(snapDir); + // 1100ms covers HFS+ 1s mtime granularity on macOS dev hosts (APFS/ext4 have finer) + Thread.sleep(1100); + + zks.loadData(false); + + long lastModAfter = getLatestSnapshotModTime(snapDir); + assertTrue("Snapshot file should be updated when skipSnapshot=false", + lastModAfter > lastModBefore); + } finally { + ClientBase.recursiveDelete(tmpDir); + } + } + + /** + * Test that loadData(true) skips the snapshot — file is NOT rewritten. + */ + @Test + public void testLoadDataSkipsSnapshotWhenRequested() throws Exception { + File tmpDir = ClientBase.createTmpDir(); + try { + ZooKeeperServer zks = createServerWithInitializedDb(tmpDir); + File snapDir = new File(tmpDir, "version-2"); + long lastModBefore = getLatestSnapshotModTime(snapDir); + Thread.sleep(1100); + + zks.loadData(true); + + long lastModAfter = getLatestSnapshotModTime(snapDir); + assertEquals("Snapshot file should NOT be updated when skipSnapshot=true", + lastModBefore, lastModAfter); + } finally { + ClientBase.recursiveDelete(tmpDir); + } + } + + /** + * Test that loadData() no-arg delegates to loadData(false) — takes snapshot. + */ + @Test + public void testLoadDataNoArgDelegatesToDefault() throws Exception { + File tmpDir = ClientBase.createTmpDir(); + try { + ZooKeeperServer zks = createServerWithInitializedDb(tmpDir); + File snapDir = new File(tmpDir, "version-2"); + long lastModBefore = getLatestSnapshotModTime(snapDir); + Thread.sleep(1100); + + zks.loadData(); + + long lastModAfter = getLatestSnapshotModTime(snapDir); + assertTrue("No-arg loadData() should take snapshot (backward compatible)", + lastModAfter > lastModBefore); + } finally { + ClientBase.recursiveDelete(tmpDir); + } + } + + private static long getLatestSnapshotModTime(File dir) { + if (dir == null || !dir.exists()) { + return 0; + } + File[] snaps = dir.listFiles((d, name) -> name.startsWith("snapshot.")); + if (snaps == null || snaps.length == 0) { + return 0; + } + long latest = 0; + for (File f : snaps) { + latest = Math.max(latest, f.lastModified()); + } + return latest; + } + } diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/QuorumPeerTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/QuorumPeerTest.java index 03cb6f06a83..b28e56985af 100644 --- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/QuorumPeerTest.java +++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/QuorumPeerTest.java @@ -107,4 +107,31 @@ public void testIsNotLeaderBecauseNoVote() throws Exception { assertFalse(peer.isLeader(localPeerId)); } + @Test + public void testSkipLeaderStartupSnapshotDefaultsToFalse() throws Exception { + // Guard against test-ordering pollution: another test or a Surefire forked-JVM + // default could leave the property set. Clear it for this default-value assertion + // and restore on exit. + String prior = System.getProperty(QuorumPeer.SKIP_LEADER_STARTUP_SNAPSHOT); + System.clearProperty(QuorumPeer.SKIP_LEADER_STARTUP_SNAPSHOT); + try { + QuorumPeer peer = new QuorumPeer(); + assertFalse("skipLeaderStartupSnapshot should default to false", + peer.isSkipLeaderStartupSnapshot()); + } finally { + if (prior != null) { + System.setProperty(QuorumPeer.SKIP_LEADER_STARTUP_SNAPSHOT, prior); + } + } + } + + @Test + public void testSkipLeaderStartupSnapshotSetterGetter() throws Exception { + QuorumPeer peer = new QuorumPeer(); + peer.setSkipLeaderStartupSnapshot(true); + assertTrue(peer.isSkipLeaderStartupSnapshot()); + peer.setSkipLeaderStartupSnapshot(false); + assertFalse(peer.isSkipLeaderStartupSnapshot()); + } + } diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java index b8630ae1319..07e9b6d15de 100644 --- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java +++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java @@ -822,6 +822,158 @@ private void proposeNewSession(QuorumPacket qp, long zxid, long sessionId) throw }); } + /** + * Test that leader election skips snapshot in loadData() when + * skipLeaderStartupSnapshot is enabled. This verifies the fix for + * ZOOKEEPER-4766: leader election time should not scale with tree size. + */ + @Test + public void testLeaderSkipsSnapshotWhenConfigured() throws Exception { + File tmpDir = File.createTempFile("test", "dir", testData); + tmpDir.delete(); + tmpDir.mkdir(); + LeadThread leadThread = null; + Leader leader = null; + try { + QuorumPeer peer = createQuorumPeer(tmpDir); + // Enable skip-snapshot + peer.setSkipLeaderStartupSnapshot(true); + + leader = createLeader(tmpDir, peer); + peer.leader = leader; + + // Pre-initialize the database (as QuorumPeer.start() does before election). + // This ensures zkDb.isInitialized()==true so loadData() won't call + // loadDataBase() which itself creates an initial snapshot. + leader.zk.getZKDatabase().loadDataBase(); + + File snapDir = new File(tmpDir, "version-2"); + long lastModBefore = getLatestSnapshotModTime(snapDir); + // 1100ms covers HFS+ 1s mtime granularity on macOS dev hosts + Thread.sleep(1100); + + leadThread = new LeadThread(leader); + leadThread.start(); + + waitForCnxAcceptor(leader); + + // The leader is now past loadData(). With skipSnapshot=true, + // the snapshot file should NOT have been rewritten. + long lastModAfter = getLatestSnapshotModTime(snapDir); + assertEquals("Snapshot should NOT be rewritten when skipLeaderStartupSnapshot=true", + lastModBefore, lastModAfter); + } finally { + if (leader != null) { + leader.shutdown("end of test"); + } + if (leadThread != null) { + leadThread.interrupt(); + leadThread.join(); + } + TestUtils.deleteFileRecursively(tmpDir); + } + } + + /** + * Test that leader election DOES take snapshot when + * skipLeaderStartupSnapshot is disabled (default behavior). + */ + @Test + public void testLeaderTakesSnapshotByDefault() throws Exception { + File tmpDir = File.createTempFile("test", "dir", testData); + tmpDir.delete(); + tmpDir.mkdir(); + LeadThread leadThread = null; + Leader leader = null; + try { + QuorumPeer peer = createQuorumPeer(tmpDir); + // Explicitly disable (default) + peer.setSkipLeaderStartupSnapshot(false); + + leader = createLeader(tmpDir, peer); + peer.leader = leader; + + // Pre-initialize the database (as QuorumPeer.start() does before election) + leader.zk.getZKDatabase().loadDataBase(); + + File snapDir = new File(tmpDir, "version-2"); + long lastModBefore = getLatestSnapshotModTime(snapDir); + Thread.sleep(1100); + + leadThread = new LeadThread(leader); + leadThread.start(); + + waitForCnxAcceptor(leader); + + // With skipSnapshot=false (default), the snapshot should have been rewritten + long lastModAfter = getLatestSnapshotModTime(snapDir); + assertTrue("Snapshot should be rewritten when skipLeaderStartupSnapshot=false", + lastModAfter > lastModBefore); + } finally { + if (leader != null) { + leader.shutdown("end of test"); + } + if (leadThread != null) { + leadThread.interrupt(); + leadThread.join(); + } + TestUtils.deleteFileRecursively(tmpDir); + } + } + + private static long getLatestSnapshotModTime(File dir) { + if (dir == null || !dir.exists()) { + return 0; + } + File[] snaps = dir.listFiles((d, name) -> name.startsWith("snapshot.")); + if (snaps == null || snaps.length == 0) { + return 0; + } + long latest = 0; + for (File f : snaps) { + latest = Math.max(latest, f.lastModified()); + } + return latest; + } + + /** + * Poll for cnxAcceptor liveness with a deadline. Without the bound, a leader + * thread that dies in loadData() (e.g., snapshot IOException) would hang the + * test until JUnit times out the whole suite — slow + opaque failure mode. + */ + private static void waitForCnxAcceptor(Leader leader) throws InterruptedException { + long deadline = System.currentTimeMillis() + 10_000; + while (leader.cnxAcceptor == null || !leader.cnxAcceptor.isAlive()) { + if (System.currentTimeMillis() > deadline) { + fail("Leader did not start cnxAcceptor within 10s; loadData() may have failed"); + } + Thread.sleep(20); + } + } + + /** + * Regression guard for the follower path: the skipLeaderStartupSnapshot flag is + * leader-path only (read at Leader.java:590). The follower codepath + * (Learner.syncWithLeader) must not branch on it. Re-runs the standard follower + * DIFF-sync conversation with the flag globally enabled; if a future refactor + * leaks the flag into Learner / Follower, this test will diverge from + * testNormalFollowerRunWithDiff and flag the regression. + */ + @Test + public void testFollowerPathUnaffectedBySkipFlag() throws Exception { + String prior = System.getProperty(QuorumPeer.SKIP_LEADER_STARTUP_SNAPSHOT); + System.setProperty(QuorumPeer.SKIP_LEADER_STARTUP_SNAPSHOT, "true"); + try { + testNormalFollowerRunWithDiff(); + } finally { + if (prior == null) { + System.clearProperty(QuorumPeer.SKIP_LEADER_STARTUP_SNAPSHOT); + } else { + System.setProperty(QuorumPeer.SKIP_LEADER_STARTUP_SNAPSHOT, prior); + } + } + } + @Test public void testNormalRun() throws Exception { testLeaderConversation(new LeaderConversation() {