Skip to content

Commit 022d159

Browse files
committed
retry lock release and update log
1 parent 4fef9d7 commit 022d159

1 file changed

Lines changed: 28 additions & 2 deletions

File tree

src/Storages/MergeTree/ExportPartitionTaskScheduler.cpp

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -354,7 +354,7 @@ void ExportPartitionTaskScheduler::handlePartExportFailure(
354354
size_t max_retries
355355
)
356356
{
357-
LOG_INFO(storage.log, "ExportPartition scheduler task: Part {} export failed, will now increment counters", part_name);
357+
LOG_INFO(storage.log, "ExportPartition scheduler task: Part {} export failed", part_name);
358358

359359
if (!exception)
360360
{
@@ -381,7 +381,33 @@ void ExportPartitionTaskScheduler::handlePartExportFailure(
381381
/// Early exit if the query was cancelled - no need to increment error counts
382382
if (exception->code() == ErrorCodes::QUERY_WAS_CANCELLED)
383383
{
384-
zk->tryRemove(export_path / "locks" / part_name, locked_by_stat.version);
384+
/// Releasing the lock is important because a query can be cancelled due to SYSTEM STOP MOVES. If this is the case,
385+
/// other replicas should still be able to export this individual part. That's why there is a retry loop here.
386+
/// It is very unlikely this will be a problem in practice. The lock is ephemeral, which means it is automatically released
387+
/// if ClickHouse loses connection to ZooKeeper
388+
std::size_t retry_count = 0;
389+
static constexpr std::size_t max_lock_release_retries = 3;
390+
while (retry_count < max_lock_release_retries)
391+
{
392+
ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRequests);
393+
ProfileEvents::increment(ProfileEvents::ExportPartitionZooKeeperRemove);
394+
395+
const auto removal_code = zk->tryRemove(export_path / "locks" / part_name, locked_by_stat.version);
396+
397+
if (Coordination::Error::ZOK == removal_code)
398+
{
399+
break;
400+
}
401+
402+
if (Coordination::Error::ZBADVERSION == removal_code)
403+
{
404+
LOG_INFO(storage.log, "ExportPartition scheduler task: Part {} lock version mismatch, will not increment error counts", part_name);
405+
break;
406+
}
407+
408+
retry_count++;
409+
}
410+
385411
LOG_INFO(storage.log, "ExportPartition scheduler task: Part {} export was cancelled, skipping error handling", part_name);
386412
return;
387413
}

0 commit comments

Comments
 (0)