diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/ZooKeeperServer.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/ZooKeeperServer.java
index c0963695899..2b2a5e5f569 100644
--- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/ZooKeeperServer.java
+++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/ZooKeeperServer.java
@@ -492,6 +492,28 @@ public void setZKDatabase(ZKDatabase zkDb) {
* Restore sessions and data
*/
public void loadData() throws IOException, InterruptedException {
+ loadData(false);
+ }
+
+ /**
+ * Restore sessions and data, optionally skipping the startup snapshot.
+ *
+ * During leader election, the synchronous snapshot in loadData() blocks
+ * quorum formation — on large ensembles (15M+ znodes) it takes 34-43s,
+ * which can exceed initLimit and cause repeated election failures.
+ * Skipping the snapshot is safe because:
+ * - Dead session cleanup (killSession) is idempotent on recovery
+ * - Follower sync uses the in-memory DataTree, not the disk snapshot
+ * - SyncRequestProcessor takes periodic snapshots after quorum forms
+ * - This matches the approach in ZOOKEEPER-1558 (branch-3.4, 2013)
+ *
+ * @param skipSnapshot if true, skip the startup snapshot. The periodic
+ * snapshot mechanism in SyncRequestProcessor will persist state
+ * after quorum is established and transactions begin flowing.
+ * @see ZOOKEEPER-1558
+ * @see ZOOKEEPER-4766
+ */
+ public void loadData(boolean skipSnapshot) throws IOException, InterruptedException {
/*
* When a new leader starts executing Leader#lead, it
* invokes this method. The database, however, has been
@@ -529,8 +551,15 @@ public void loadData() throws IOException, InterruptedException {
killSession(session, zkDb.getDataTreeLastProcessedZxid());
}
- // Make a clean snapshot
- takeSnapshot();
+ if (skipSnapshot) {
+ LOG.info("Skipping startup snapshot (periodic snapshot will persist state). "
+ + "lastProcessedZxid: 0x{}, dead sessions cleaned: {}",
+ Long.toHexString(zkDb.getDataTreeLastProcessedZxid()),
+ deadSessions.size());
+ } else {
+ // Make a clean snapshot
+ takeSnapshot();
+ }
}
public void takeSnapshot() {
diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Leader.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Leader.java
index e90df8f67a1..7677b99dc63 100644
--- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Leader.java
+++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Leader.java
@@ -587,7 +587,7 @@ void lead() throws IOException, InterruptedException {
try {
self.setZabState(QuorumPeer.ZabState.DISCOVERY);
self.tick.set(0);
- zk.loadData();
+ zk.loadData(self.isSkipLeaderStartupSnapshot());
leaderStateSummary = new StateSummary(self.getCurrentEpoch(), zk.getLastProcessedZxid());
diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java
index 17705ef7b75..2f7a22a6ef3 100644
--- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java
+++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java
@@ -720,6 +720,18 @@ public synchronized void setCurrentVote(Vote v) {
*/
protected volatile int initLimit;
+ /**
+ * Whether to skip the synchronous snapshot during leader startup.
+ * On large ensembles (15M+ znodes), the snapshot in loadData() takes
+ * 34-43s, blocking quorum formation and causing repeated election
+ * failures when it exceeds initLimit. Skipping it is safe — see
+ * ZOOKEEPER-1558 and ZOOKEEPER-4766.
+ */
+ public static final String SKIP_LEADER_STARTUP_SNAPSHOT =
+ "zookeeper.leaderElection.skipStartupSnapshot";
+ private volatile boolean skipLeaderStartupSnapshot =
+ Boolean.getBoolean(SKIP_LEADER_STARTUP_SNAPSHOT);
+
/**
* The number of ticks that can pass between sending a request and getting
* an acknowledgment
@@ -1819,6 +1831,14 @@ public void setInitLimit(int initLimit) {
this.initLimit = initLimit;
}
+ public boolean isSkipLeaderStartupSnapshot() {
+ return skipLeaderStartupSnapshot;
+ }
+
+ public void setSkipLeaderStartupSnapshot(boolean skip) {
+ this.skipLeaderStartupSnapshot = skip;
+ }
+
/**
* Get the current tick
*/
diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/ZooKeeperServerTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/ZooKeeperServerTest.java
index 81469a8f858..5956841fcfd 100644
--- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/ZooKeeperServerTest.java
+++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/ZooKeeperServerTest.java
@@ -26,6 +26,7 @@
import java.util.List;
import org.apache.zookeeper.ZKTestCase;
import org.apache.zookeeper.server.persistence.FileTxnLog;
+import org.apache.zookeeper.server.persistence.FileTxnSnapLog;
import org.apache.zookeeper.server.persistence.SnapStream;
import org.apache.zookeeper.server.persistence.Util;
import org.apache.zookeeper.test.ClientBase;
@@ -135,4 +136,111 @@ public void testInvalidSnapshot() {
}
}
+ /**
+ * Helper: create a ZooKeeperServer with an initialized database.
+ * This simulates the state during leader election where the DB
+ * is already loaded (QuorumPeer.start() loads it before election).
+ */
+ private ZooKeeperServer createServerWithInitializedDb(File tmpDir) throws Exception {
+ FileTxnSnapLog snapLog = new FileTxnSnapLog(tmpDir, tmpDir);
+ ZKDatabase zkDb = new ZKDatabase(snapLog);
+ // Initialize DB (as QuorumPeer.start() would do before election)
+ zkDb.loadDataBase();
+ ZooKeeperServer zks = new ZooKeeperServer(snapLog, 2000, null);
+ zks.setZKDatabase(zkDb);
+ return zks;
+ }
+
+ private static int countSnapshotFiles(File dir) {
+ if (dir == null || !dir.exists()) {
+ return 0;
+ }
+ File[] snaps = dir.listFiles((d, name) -> name.startsWith("snapshot."));
+ return snaps == null ? 0 : snaps.length;
+ }
+
+ /**
+ * Test that loadData(false) takes a snapshot (default behavior).
+ * We detect this by checking that the snapshot file's modification
+ * time is updated (the file name stays the same since zxid is unchanged).
+ */
+ @Test
+ public void testLoadDataTakesSnapshotByDefault() throws Exception {
+ File tmpDir = ClientBase.createTmpDir();
+ try {
+ ZooKeeperServer zks = createServerWithInitializedDb(tmpDir);
+ File snapDir = new File(tmpDir, "version-2");
+ long lastModBefore = getLatestSnapshotModTime(snapDir);
+ // 1100ms covers HFS+ 1s mtime granularity on macOS dev hosts (APFS/ext4 have finer)
+ Thread.sleep(1100);
+
+ zks.loadData(false);
+
+ long lastModAfter = getLatestSnapshotModTime(snapDir);
+ assertTrue("Snapshot file should be updated when skipSnapshot=false",
+ lastModAfter > lastModBefore);
+ } finally {
+ ClientBase.recursiveDelete(tmpDir);
+ }
+ }
+
+ /**
+ * Test that loadData(true) skips the snapshot — file is NOT rewritten.
+ */
+ @Test
+ public void testLoadDataSkipsSnapshotWhenRequested() throws Exception {
+ File tmpDir = ClientBase.createTmpDir();
+ try {
+ ZooKeeperServer zks = createServerWithInitializedDb(tmpDir);
+ File snapDir = new File(tmpDir, "version-2");
+ long lastModBefore = getLatestSnapshotModTime(snapDir);
+ Thread.sleep(1100);
+
+ zks.loadData(true);
+
+ long lastModAfter = getLatestSnapshotModTime(snapDir);
+ assertEquals("Snapshot file should NOT be updated when skipSnapshot=true",
+ lastModBefore, lastModAfter);
+ } finally {
+ ClientBase.recursiveDelete(tmpDir);
+ }
+ }
+
+ /**
+ * Test that loadData() no-arg delegates to loadData(false) — takes snapshot.
+ */
+ @Test
+ public void testLoadDataNoArgDelegatesToDefault() throws Exception {
+ File tmpDir = ClientBase.createTmpDir();
+ try {
+ ZooKeeperServer zks = createServerWithInitializedDb(tmpDir);
+ File snapDir = new File(tmpDir, "version-2");
+ long lastModBefore = getLatestSnapshotModTime(snapDir);
+ Thread.sleep(1100);
+
+ zks.loadData();
+
+ long lastModAfter = getLatestSnapshotModTime(snapDir);
+ assertTrue("No-arg loadData() should take snapshot (backward compatible)",
+ lastModAfter > lastModBefore);
+ } finally {
+ ClientBase.recursiveDelete(tmpDir);
+ }
+ }
+
+ private static long getLatestSnapshotModTime(File dir) {
+ if (dir == null || !dir.exists()) {
+ return 0;
+ }
+ File[] snaps = dir.listFiles((d, name) -> name.startsWith("snapshot."));
+ if (snaps == null || snaps.length == 0) {
+ return 0;
+ }
+ long latest = 0;
+ for (File f : snaps) {
+ latest = Math.max(latest, f.lastModified());
+ }
+ return latest;
+ }
+
}
diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/QuorumPeerTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/QuorumPeerTest.java
index 03cb6f06a83..b28e56985af 100644
--- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/QuorumPeerTest.java
+++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/QuorumPeerTest.java
@@ -107,4 +107,31 @@ public void testIsNotLeaderBecauseNoVote() throws Exception {
assertFalse(peer.isLeader(localPeerId));
}
+ @Test
+ public void testSkipLeaderStartupSnapshotDefaultsToFalse() throws Exception {
+ // Guard against test-ordering pollution: another test or a Surefire forked-JVM
+ // default could leave the property set. Clear it for this default-value assertion
+ // and restore on exit.
+ String prior = System.getProperty(QuorumPeer.SKIP_LEADER_STARTUP_SNAPSHOT);
+ System.clearProperty(QuorumPeer.SKIP_LEADER_STARTUP_SNAPSHOT);
+ try {
+ QuorumPeer peer = new QuorumPeer();
+ assertFalse("skipLeaderStartupSnapshot should default to false",
+ peer.isSkipLeaderStartupSnapshot());
+ } finally {
+ if (prior != null) {
+ System.setProperty(QuorumPeer.SKIP_LEADER_STARTUP_SNAPSHOT, prior);
+ }
+ }
+ }
+
+ @Test
+ public void testSkipLeaderStartupSnapshotSetterGetter() throws Exception {
+ QuorumPeer peer = new QuorumPeer();
+ peer.setSkipLeaderStartupSnapshot(true);
+ assertTrue(peer.isSkipLeaderStartupSnapshot());
+ peer.setSkipLeaderStartupSnapshot(false);
+ assertFalse(peer.isSkipLeaderStartupSnapshot());
+ }
+
}
diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java
index b8630ae1319..07e9b6d15de 100644
--- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java
+++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java
@@ -822,6 +822,158 @@ private void proposeNewSession(QuorumPacket qp, long zxid, long sessionId) throw
});
}
+ /**
+ * Test that leader election skips snapshot in loadData() when
+ * skipLeaderStartupSnapshot is enabled. This verifies the fix for
+ * ZOOKEEPER-4766: leader election time should not scale with tree size.
+ */
+ @Test
+ public void testLeaderSkipsSnapshotWhenConfigured() throws Exception {
+ File tmpDir = File.createTempFile("test", "dir", testData);
+ tmpDir.delete();
+ tmpDir.mkdir();
+ LeadThread leadThread = null;
+ Leader leader = null;
+ try {
+ QuorumPeer peer = createQuorumPeer(tmpDir);
+ // Enable skip-snapshot
+ peer.setSkipLeaderStartupSnapshot(true);
+
+ leader = createLeader(tmpDir, peer);
+ peer.leader = leader;
+
+ // Pre-initialize the database (as QuorumPeer.start() does before election).
+ // This ensures zkDb.isInitialized()==true so loadData() won't call
+ // loadDataBase() which itself creates an initial snapshot.
+ leader.zk.getZKDatabase().loadDataBase();
+
+ File snapDir = new File(tmpDir, "version-2");
+ long lastModBefore = getLatestSnapshotModTime(snapDir);
+ // 1100ms covers HFS+ 1s mtime granularity on macOS dev hosts
+ Thread.sleep(1100);
+
+ leadThread = new LeadThread(leader);
+ leadThread.start();
+
+ waitForCnxAcceptor(leader);
+
+ // The leader is now past loadData(). With skipSnapshot=true,
+ // the snapshot file should NOT have been rewritten.
+ long lastModAfter = getLatestSnapshotModTime(snapDir);
+ assertEquals("Snapshot should NOT be rewritten when skipLeaderStartupSnapshot=true",
+ lastModBefore, lastModAfter);
+ } finally {
+ if (leader != null) {
+ leader.shutdown("end of test");
+ }
+ if (leadThread != null) {
+ leadThread.interrupt();
+ leadThread.join();
+ }
+ TestUtils.deleteFileRecursively(tmpDir);
+ }
+ }
+
+ /**
+ * Test that leader election DOES take snapshot when
+ * skipLeaderStartupSnapshot is disabled (default behavior).
+ */
+ @Test
+ public void testLeaderTakesSnapshotByDefault() throws Exception {
+ File tmpDir = File.createTempFile("test", "dir", testData);
+ tmpDir.delete();
+ tmpDir.mkdir();
+ LeadThread leadThread = null;
+ Leader leader = null;
+ try {
+ QuorumPeer peer = createQuorumPeer(tmpDir);
+ // Explicitly disable (default)
+ peer.setSkipLeaderStartupSnapshot(false);
+
+ leader = createLeader(tmpDir, peer);
+ peer.leader = leader;
+
+ // Pre-initialize the database (as QuorumPeer.start() does before election)
+ leader.zk.getZKDatabase().loadDataBase();
+
+ File snapDir = new File(tmpDir, "version-2");
+ long lastModBefore = getLatestSnapshotModTime(snapDir);
+ Thread.sleep(1100);
+
+ leadThread = new LeadThread(leader);
+ leadThread.start();
+
+ waitForCnxAcceptor(leader);
+
+ // With skipSnapshot=false (default), the snapshot should have been rewritten
+ long lastModAfter = getLatestSnapshotModTime(snapDir);
+ assertTrue("Snapshot should be rewritten when skipLeaderStartupSnapshot=false",
+ lastModAfter > lastModBefore);
+ } finally {
+ if (leader != null) {
+ leader.shutdown("end of test");
+ }
+ if (leadThread != null) {
+ leadThread.interrupt();
+ leadThread.join();
+ }
+ TestUtils.deleteFileRecursively(tmpDir);
+ }
+ }
+
+ private static long getLatestSnapshotModTime(File dir) {
+ if (dir == null || !dir.exists()) {
+ return 0;
+ }
+ File[] snaps = dir.listFiles((d, name) -> name.startsWith("snapshot."));
+ if (snaps == null || snaps.length == 0) {
+ return 0;
+ }
+ long latest = 0;
+ for (File f : snaps) {
+ latest = Math.max(latest, f.lastModified());
+ }
+ return latest;
+ }
+
+ /**
+ * Poll for cnxAcceptor liveness with a deadline. Without the bound, a leader
+ * thread that dies in loadData() (e.g., snapshot IOException) would hang the
+ * test until JUnit times out the whole suite — slow + opaque failure mode.
+ */
+ private static void waitForCnxAcceptor(Leader leader) throws InterruptedException {
+ long deadline = System.currentTimeMillis() + 10_000;
+ while (leader.cnxAcceptor == null || !leader.cnxAcceptor.isAlive()) {
+ if (System.currentTimeMillis() > deadline) {
+ fail("Leader did not start cnxAcceptor within 10s; loadData() may have failed");
+ }
+ Thread.sleep(20);
+ }
+ }
+
+ /**
+ * Regression guard for the follower path: the skipLeaderStartupSnapshot flag is
+ * leader-path only (read at Leader.java:590). The follower codepath
+ * (Learner.syncWithLeader) must not branch on it. Re-runs the standard follower
+ * DIFF-sync conversation with the flag globally enabled; if a future refactor
+ * leaks the flag into Learner / Follower, this test will diverge from
+ * testNormalFollowerRunWithDiff and flag the regression.
+ */
+ @Test
+ public void testFollowerPathUnaffectedBySkipFlag() throws Exception {
+ String prior = System.getProperty(QuorumPeer.SKIP_LEADER_STARTUP_SNAPSHOT);
+ System.setProperty(QuorumPeer.SKIP_LEADER_STARTUP_SNAPSHOT, "true");
+ try {
+ testNormalFollowerRunWithDiff();
+ } finally {
+ if (prior == null) {
+ System.clearProperty(QuorumPeer.SKIP_LEADER_STARTUP_SNAPSHOT);
+ } else {
+ System.setProperty(QuorumPeer.SKIP_LEADER_STARTUP_SNAPSHOT, prior);
+ }
+ }
+ }
+
@Test
public void testNormalRun() throws Exception {
testLeaderConversation(new LeaderConversation() {