@@ -354,7 +354,7 @@ void ExportPartitionTaskScheduler::handlePartExportFailure(
354354 size_t max_retries
355355)
356356{
357- LOG_INFO (storage.log , " ExportPartition scheduler task: Part {} export failed, will now increment counters " , part_name);
357+ LOG_INFO (storage.log , " ExportPartition scheduler task: Part {} export failed" , part_name);
358358
359359 if (!exception)
360360 {
@@ -381,7 +381,33 @@ void ExportPartitionTaskScheduler::handlePartExportFailure(
381381 // / Early exit if the query was cancelled - no need to increment error counts
382382 if (exception->code () == ErrorCodes::QUERY_WAS_CANCELLED)
383383 {
384- zk->tryRemove (export_path / " locks" / part_name, locked_by_stat.version );
384+ // / Releasing the lock is important because a query can be cancelled due to SYSTEM STOP MOVES. If this is the case,
385+ // / other replicas should still be able to export this individual part. That's why there is a retry loop here.
386+ // / It is very unlikely this will be a problem in practice. The lock is ephemeral, which means it is automatically released
387+ // / if ClickHouse loses connection to ZooKeeper
388+ std::size_t retry_count = 0 ;
389+ static constexpr std::size_t max_lock_release_retries = 3 ;
390+ while (retry_count < max_lock_release_retries)
391+ {
392+ ProfileEvents::increment (ProfileEvents::ExportPartitionZooKeeperRequests);
393+ ProfileEvents::increment (ProfileEvents::ExportPartitionZooKeeperRemove);
394+
395+ const auto removal_code = zk->tryRemove (export_path / " locks" / part_name, locked_by_stat.version );
396+
397+ if (Coordination::Error::ZOK == removal_code)
398+ {
399+ break ;
400+ }
401+
402+ if (Coordination::Error::ZBADVERSION == removal_code)
403+ {
404+ LOG_INFO (storage.log , " ExportPartition scheduler task: Part {} lock version mismatch, will not increment error counts" , part_name);
405+ break ;
406+ }
407+
408+ retry_count++;
409+ }
410+
385411 LOG_INFO (storage.log , " ExportPartition scheduler task: Part {} export was cancelled, skipping error handling" , part_name);
386412 return ;
387413 }
0 commit comments