Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,28 @@ public void setZKDatabase(ZKDatabase zkDb) {
* Restore sessions and data
*/
public void loadData() throws IOException, InterruptedException {
    // Default path: the startup snapshot is not skipped.
    final boolean skipSnapshot = false;
    loadData(skipSnapshot);
}

/**
* Restore sessions and data, optionally skipping the startup snapshot.
*
* During leader election, the synchronous snapshot in loadData() blocks
* quorum formation. On ensembles holding 15M+ znodes it takes 34-43s,
* which can exceed initLimit and cause repeated election failures.
* Skipping the snapshot is safe because:
* - Dead session cleanup (killSession) is idempotent on recovery
* - Follower sync uses the in-memory DataTree, not the disk snapshot
* - SyncRequestProcessor takes periodic snapshots after quorum forms
* - This matches the approach in ZOOKEEPER-1558 (branch-3.4, 2013)
*
* @param skipSnapshot if true, skip the startup snapshot. The periodic
* snapshot mechanism in SyncRequestProcessor will persist state
* after quorum is established and transactions begin flowing.
* @see <a href="https://issues.apache.org/jira/browse/ZOOKEEPER-1558">ZOOKEEPER-1558</a>
* @see <a href="https://issues.apache.org/jira/browse/ZOOKEEPER-4766">ZOOKEEPER-4766</a>
*/
public void loadData(boolean skipSnapshot) throws IOException, InterruptedException {
/*
* When a new leader starts executing Leader#lead, it
* invokes this method. The database, however, has been
Expand Down Expand Up @@ -529,8 +551,15 @@ public void loadData() throws IOException, InterruptedException {
killSession(session, zkDb.getDataTreeLastProcessedZxid());
}

// Make a clean snapshot
takeSnapshot();
if (skipSnapshot) {
LOG.info("Skipping startup snapshot (periodic snapshot will persist state). "
Comment thread
laxman-ch marked this conversation as resolved.
+ "lastProcessedZxid: 0x{}, dead sessions cleaned: {}",
Long.toHexString(zkDb.getDataTreeLastProcessedZxid()),
deadSessions.size());
} else {
// Make a clean snapshot
takeSnapshot();
}
}

public void takeSnapshot() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -587,7 +587,7 @@ void lead() throws IOException, InterruptedException {
try {
self.setZabState(QuorumPeer.ZabState.DISCOVERY);
self.tick.set(0);
zk.loadData();
zk.loadData(self.isSkipLeaderStartupSnapshot());

leaderStateSummary = new StateSummary(self.getCurrentEpoch(), zk.getLastProcessedZxid());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -720,6 +720,18 @@ public synchronized void setCurrentVote(Vote v) {
*/
protected volatile int initLimit;

/**
 * Name of the system property that controls whether the synchronous
 * snapshot during leader startup is skipped. The property is read with
 * {@link Boolean#getBoolean(String)} to initialise the
 * {@code skipLeaderStartupSnapshot} field, so the snapshot is taken
 * unless the property is set to "true".
 *
 * On ensembles holding 15M+ znodes, the snapshot in loadData() takes
 * 34-43s, blocking quorum formation and causing repeated election
 * failures when it exceeds initLimit. Skipping it is safe — see
 * ZOOKEEPER-1558 and ZOOKEEPER-4766.
 */
public static final String SKIP_LEADER_STARTUP_SNAPSHOT =
"zookeeper.leaderElection.skipStartupSnapshot";
Comment thread
laxman-ch marked this conversation as resolved.
/**
 * Per-instance flag, initialised from the SKIP_LEADER_STARTUP_SNAPSHOT
 * system property when this object is constructed. It can be changed
 * afterwards through setSkipLeaderStartupSnapshot(boolean).
 */
private volatile boolean skipLeaderStartupSnapshot =
Boolean.getBoolean(SKIP_LEADER_STARTUP_SNAPSHOT);

/**
* The number of ticks that can pass between sending a request and getting
* an acknowledgment
Expand Down Expand Up @@ -1819,6 +1831,14 @@ public void setInitLimit(int initLimit) {
this.initLimit = initLimit;
}

/**
 * Reports whether the leader startup snapshot is configured to be skipped.
 *
 * @return the current value of the {@code skipLeaderStartupSnapshot} flag
 */
public boolean isSkipLeaderStartupSnapshot() {
    return this.skipLeaderStartupSnapshot;
}

/**
 * Overrides the {@code skipLeaderStartupSnapshot} flag.
 *
 * @param skip the new value of the flag; {@code true} requests that the
 *             leader startup snapshot be skipped
 */
public void setSkipLeaderStartupSnapshot(boolean skip) {
    skipLeaderStartupSnapshot = skip;
}

/**
* Get the current tick
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import java.util.List;
import org.apache.zookeeper.ZKTestCase;
import org.apache.zookeeper.server.persistence.FileTxnLog;
import org.apache.zookeeper.server.persistence.FileTxnSnapLog;
import org.apache.zookeeper.server.persistence.SnapStream;
import org.apache.zookeeper.server.persistence.Util;
import org.apache.zookeeper.test.ClientBase;
Expand Down Expand Up @@ -135,4 +136,111 @@ public void testInvalidSnapshot() {
}
}

/**
 * Helper: builds a ZooKeeperServer whose ZKDatabase has already been
 * loaded from {@code tmpDir} (used as both directories of the
 * FileTxnSnapLog). This simulates the state during leader election,
 * where the DB is already loaded (QuorumPeer.start() loads it before
 * election).
 *
 * @param tmpDir directory passed as both arguments to FileTxnSnapLog
 * @return a server wired to the pre-loaded database
 */
private ZooKeeperServer createServerWithInitializedDb(File tmpDir) throws Exception {
    final FileTxnSnapLog txnSnapLog = new FileTxnSnapLog(tmpDir, tmpDir);

    // Load the database up front, as QuorumPeer.start() would do before election.
    final ZKDatabase database = new ZKDatabase(txnSnapLog);
    database.loadDataBase();

    final ZooKeeperServer server = new ZooKeeperServer(txnSnapLog, 2000, null);
    server.setZKDatabase(database);
    return server;
}

/**
 * Counts the entries directly inside {@code dir} whose name starts with
 * {@code "snapshot."}.
 *
 * @param dir directory to inspect; may be {@code null}
 * @return the number of matching entries, or 0 when {@code dir} is
 *         {@code null}, does not exist, or cannot be listed
 */
private static int countSnapshotFiles(File dir) {
    if (dir != null && dir.exists()) {
        final File[] matches = dir.listFiles((parent, fileName) -> fileName.startsWith("snapshot."));
        if (matches != null) {
            return matches.length;
        }
    }
    return 0;
}

/**
 * Default behavior: loadData(false) takes a snapshot.
 * The snapshot file name stays the same (zxid is unchanged), so the
 * write is detected through a newer modification time on the latest
 * snapshot file.
 */
@Test
public void testLoadDataTakesSnapshotByDefault() throws Exception {
    final File dataDir = ClientBase.createTmpDir();
    try {
        final ZooKeeperServer server = createServerWithInitializedDb(dataDir);
        final File versionDir = new File(dataDir, "version-2");

        final long mtimeBefore = getLatestSnapshotModTime(versionDir);
        // 1100ms covers HFS+ 1s mtime granularity on macOS dev hosts (APFS/ext4 have finer)
        Thread.sleep(1100);
        server.loadData(false);
        final long mtimeAfter = getLatestSnapshotModTime(versionDir);

        assertTrue("Snapshot file should be updated when skipSnapshot=false",
            mtimeAfter > mtimeBefore);
    } finally {
        ClientBase.recursiveDelete(dataDir);
    }
}

/**
 * loadData(true) must skip the snapshot: the latest snapshot file's
 * modification time is expected to be unchanged after the call.
 */
@Test
public void testLoadDataSkipsSnapshotWhenRequested() throws Exception {
    final File dataDir = ClientBase.createTmpDir();
    try {
        final ZooKeeperServer server = createServerWithInitializedDb(dataDir);
        final File versionDir = new File(dataDir, "version-2");

        final long mtimeBefore = getLatestSnapshotModTime(versionDir);
        // Wait long enough that a rewrite, had it happened, would show a different mtime.
        Thread.sleep(1100);
        server.loadData(true);
        final long mtimeAfter = getLatestSnapshotModTime(versionDir);

        assertEquals("Snapshot file should NOT be updated when skipSnapshot=true",
            mtimeBefore, mtimeAfter);
    } finally {
        ClientBase.recursiveDelete(dataDir);
    }
}

/**
 * The no-arg loadData() must behave like loadData(false): a snapshot is
 * taken, observed as a newer modification time on the latest snapshot file.
 */
@Test
public void testLoadDataNoArgDelegatesToDefault() throws Exception {
    final File dataDir = ClientBase.createTmpDir();
    try {
        final ZooKeeperServer server = createServerWithInitializedDb(dataDir);
        final File versionDir = new File(dataDir, "version-2");

        final long mtimeBefore = getLatestSnapshotModTime(versionDir);
        // Wait long enough that the rewrite shows up as a strictly newer mtime.
        Thread.sleep(1100);
        server.loadData();
        final long mtimeAfter = getLatestSnapshotModTime(versionDir);

        assertTrue("No-arg loadData() should take snapshot (backward compatible)",
            mtimeAfter > mtimeBefore);
    } finally {
        ClientBase.recursiveDelete(dataDir);
    }
}

/**
 * Returns the most recent modification time among the entries directly
 * inside {@code dir} whose name starts with {@code "snapshot."}.
 *
 * @param dir directory to inspect; may be {@code null}
 * @return the largest {@code lastModified()} value found, or 0 when
 *         {@code dir} is {@code null}, does not exist, cannot be listed,
 *         or contains no matching entry
 */
private static long getLatestSnapshotModTime(File dir) {
    long newest = 0;
    if (dir == null || !dir.exists()) {
        return newest;
    }
    final File[] candidates = dir.listFiles((parent, fileName) -> fileName.startsWith("snapshot."));
    if (candidates != null) {
        for (int i = 0; i < candidates.length; i++) {
            final long modified = candidates[i].lastModified();
            if (modified > newest) {
                newest = modified;
            }
        }
    }
    return newest;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -107,4 +107,31 @@ public void testIsNotLeaderBecauseNoVote() throws Exception {
assertFalse(peer.isLeader(localPeerId));
}

/**
 * With the system property unset, a freshly constructed QuorumPeer must
 * report that the leader startup snapshot is not skipped.
 */
@Test
public void testSkipLeaderStartupSnapshotDefaultsToFalse() throws Exception {
    // Guard against test-ordering pollution: another test or a Surefire forked-JVM
    // default could leave the property set. Remember the current value, clear it for
    // the default-value assertion, and put it back afterwards.
    final String propertyKey = QuorumPeer.SKIP_LEADER_STARTUP_SNAPSHOT;
    final String savedValue = System.getProperty(propertyKey);
    System.clearProperty(propertyKey);
    try {
        final QuorumPeer freshPeer = new QuorumPeer();
        assertFalse("skipLeaderStartupSnapshot should default to false",
            freshPeer.isSkipLeaderStartupSnapshot());
    } finally {
        // Nothing to restore when the property was not set to begin with.
        if (savedValue != null) {
            System.setProperty(propertyKey, savedValue);
        }
    }
}

/**
 * The getter must reflect whatever value was last passed to the setter.
 */
@Test
public void testSkipLeaderStartupSnapshotSetterGetter() throws Exception {
    final QuorumPeer quorumPeer = new QuorumPeer();

    // Switch the flag on and read it back.
    quorumPeer.setSkipLeaderStartupSnapshot(true);
    final boolean afterEnable = quorumPeer.isSkipLeaderStartupSnapshot();
    assertTrue(afterEnable);

    // Switch it off again and read it back.
    quorumPeer.setSkipLeaderStartupSnapshot(false);
    final boolean afterDisable = quorumPeer.isSkipLeaderStartupSnapshot();
    assertFalse(afterDisable);
}

}
Loading
Loading