11import asyncio
22import contextlib
33import dataclasses
4- import itertools
54import logging
65import typing
76
7+ from aiokafka .errors import CommitFailedError , KafkaError
88from faststream .kafka import TopicPartition
99
1010
@@ -38,6 +38,7 @@ def __init__(
3838 self ._messages_queue : asyncio .Queue [KafkaCommitTask ] = asyncio .Queue ()
3939 self ._commit_task : asyncio .Task [typing .Any ] | None = None
4040 self ._flush_batch_event = asyncio .Event ()
41+ self ._stop_requested : bool = False
4142
4243 self ._commit_batch_timeout_sec = commit_batch_timeout_sec
4344 self ._commit_batch_size = commit_batch_size
@@ -77,11 +78,11 @@ async def _populate_commit_batch(self) -> tuple[list[KafkaCommitTask], bool]:
7778 else :
7879 queue_get_task .cancel ()
7980
80- # commit_all was called — flush remaining queue items and stop
81+ # flush event — drain remaining queue items; stop only if close() was called
8182 if flush_wait_task in done :
8283 uncommited_tasks .extend (self ._flush_tasks_queue ())
8384 self ._flush_batch_event .clear ()
84- should_shutdown = True
85+ should_shutdown = self . _stop_requested
8586 break
8687
8788 if timeout_task in done :
@@ -104,25 +105,35 @@ async def _call_committer(
104105 ) -> bool :
105106 if not partitions_to_offsets :
106107 return True
107- commit_succeeded = True
108108 consumer : typing .Final [AIOKafkaConsumer ] = tasks_batch [0 ].consumer
109109 try :
110110 await consumer .commit (partitions_to_offsets )
111- except Exception as exc :
112- commit_succeeded = False
113- logger .exception ("Error during commit to kafka" , exc_info = exc )
111+ except CommitFailedError :
112+ # Partition reassignment in progress — safe to ignore, offsets will be re-committed
113+ logger .exception ("Cannot commit due to rebalancing, ignoring batch" )
114+ return False
115+ except KafkaError :
116+ # Transient error — re-queue batch for retry on next cycle
117+ logger .exception ("Error during commit to kafka, re-queuing batch" )
114118 for task in tasks_batch :
115119 await self ._messages_queue .put (task )
116- return commit_succeeded
120+ return False
121+ else :
122+ return True
117123
118124 @staticmethod
119125 def _map_offsets_per_partition (consumer_tasks : list [KafkaCommitTask ]) -> dict [TopicPartition , int ]:
120- partitions_to_tasks = itertools .groupby (
121- sorted (consumer_tasks , key = lambda x : x .topic_partition ), lambda x : x .topic_partition
122- )
126+ by_partition : dict [TopicPartition , list [KafkaCommitTask ]] = {}
127+ for task in consumer_tasks :
128+ by_partition .setdefault (task .topic_partition , []).append (task )
129+
123130 partitions_to_offsets : dict [TopicPartition , int ] = {}
124- for partition , partition_tasks in partitions_to_tasks :
125- max_offset = max ((task .offset for task in partition_tasks ), default = None )
131+ for partition , tasks in by_partition .items ():
132+ max_offset : int | None = None
133+ for task in sorted (tasks , key = lambda x : x .offset ):
134+ if task .asyncio_task .cancelled ():
135+ break # stop committing at first cancelled task — message was not processed
136+ max_offset = task .offset
126137 if max_offset is not None :
127138 # Kafka commits the *next* offset to fetch, so committed = processed_max + 1
128139 partitions_to_offsets [partition ] = max_offset + 1
@@ -133,7 +144,7 @@ async def _commit_tasks_batch(self, tasks_batch: list[KafkaCommitTask]) -> bool:
133144 * [task .asyncio_task for task in tasks_batch ], return_exceptions = True
134145 )
135146 for result in results :
136- if isinstance (result , BaseException ):
147+ if isinstance (result , BaseException ) and not isinstance ( result , asyncio . CancelledError ) :
137148 logger .error ("Task has finished with an exception" , exc_info = result )
138149
139150 # Group by consumer instance — each AIOKafkaConsumer can only commit its own partitions
@@ -159,15 +170,17 @@ async def _run_commit_process(self) -> None:
159170 await self ._commit_tasks_batch (commit_batch )
160171
async def commit_all(self) -> None:
    """Flush and commit all pending tasks without stopping the committer loop.

    Safe to call during Kafka rebalance (on_partitions_revoked). The committer
    continues running after this returns.
    """
    # Signal the batching loop to drain whatever is queued right now instead of
    # waiting for the batch-size or timeout trigger.
    self._flush_batch_event.set()
    # Wait until every queued item has been marked done on the queue.
    # NOTE(review): unlike send_task, nothing here verifies that the committer
    # task is running; if no one consumes the queue, join() would wait
    # indefinitely -- confirm callers only invoke this after spawn().
    await self._messages_queue.join()
165180
166181 async def send_task (self , new_task : KafkaCommitTask ) -> None :
167182 self ._check_is_commit_task_running ()
168- await self ._messages_queue .put (
169- new_task ,
170- )
183+ await self ._messages_queue .put (new_task )
171184
172185 def spawn (self ) -> None :
173186 if not self ._commit_task :
@@ -176,11 +189,12 @@ def spawn(self) -> None:
176189 logger .error ("Committer main task already running" )
177190
178191 async def close (self ) -> None :
179- """Close committer."""
192+ """Flush all pending tasks and shut down the committer."""
180193 if not self ._commit_task :
181194 logger .error ("Committer main task is not running, cannot close committer properly" )
182195 return
183196
197+ self ._stop_requested = True
184198 self ._flush_batch_event .set ()
185199 try :
186200 await asyncio .wait_for (self ._commit_task , timeout = self ._shutdown_timeout )
0 commit comments