@@ -140,14 +140,16 @@ async def _post_execution_cleanup(
140140 """Cleanup after task execution."""
141141 logger .debug ("Chainhook monitor task cleanup completed" )
142142
143- def _is_chainhook_healthy (self , chainhook_uuid : str ) -> bool :
143+ def _is_chainhook_healthy (self , chainhook_uuid : str ) -> tuple [ bool , bool ] :
144144 """Check if a chainhook is in a healthy state by checking its status directly.
145145
146146 Args:
147147 chainhook_uuid: UUID of the chainhook to check
148148
149149 Returns:
150- bool: True if chainhook is healthy, False otherwise
150+ tuple[bool, bool]: (is_healthy, should_recreate)
151+ - is_healthy: True if chainhook is healthy
152+ - should_recreate: True if chainhook should be recreated (permanent failure)
151153 """
152154 try :
153155 # Get the specific chainhook status
@@ -156,7 +158,7 @@ def _is_chainhook_healthy(self, chainhook_uuid: str) -> bool:
156158 # Check if chainhook is enabled
157159 if not status_response .get ("enabled" , False ):
158160 logger .warning (f"Chainhook { chainhook_uuid } is not enabled" )
159- return False
161+ return False , True # Not healthy, should recreate
160162
161163 # Check status type for any failure indicators
162164 status_info = status_response .get ("status" , {})
@@ -166,7 +168,7 @@ def _is_chainhook_healthy(self, chainhook_uuid: str) -> bool:
166168 logger .warning (
167169 f"Chainhook { chainhook_uuid } has status type: { status_type } "
168170 )
169- return False
171+ return False , True # Not healthy, should recreate
170172
171173 # Additional checks on status info if available
172174 info = status_info .get ("info" , {})
@@ -179,12 +181,16 @@ def _is_chainhook_healthy(self, chainhook_uuid: str) -> bool:
179181 logger .warning (
180182 f"Chainhook { chainhook_uuid } has expired (expired_at: { expired_at } , last_evaluated: { last_evaluated } )"
181183 )
182- return False
184+ return False , True # Not healthy, should recreate
183185
184- return True
186+ return True , False # Healthy, no need to recreate
185187 except Exception as e :
186- logger .error (f"Error checking chainhook { chainhook_uuid } health: { str (e )} " )
187- return False
188+ # This is likely a temporary failure (network, API timeout, etc.)
189+ # Don't recreate the chainhook, just log the error and try again later
190+ logger .warning (
191+ f"Temporary error checking chainhook { chainhook_uuid } health: { str (e )} "
192+ )
193+ return False , False # Not healthy (unknown), but don't recreate
188194
189195 def _recreate_chainhook_for_chain_state (self , chain_state ) -> Optional [str ]:
190196 """Recreate a chainhook for a given chain state.
@@ -329,33 +335,43 @@ async def _execute_impl(self, context: JobContext) -> List[ChainhookMonitorResul
329335 )
330336
331337 # Check if chainhook is healthy using direct status check
332- if not self ._is_chainhook_healthy (chainhook_uuid ):
338+ is_healthy , should_recreate = self ._is_chainhook_healthy (chainhook_uuid )
339+
340+ if not is_healthy :
333341 logger .warning (
334- f"Chainhook { chainhook_uuid } is unhealthy or not found "
342+ f"Chainhook { chainhook_uuid } is unhealthy (should_recreate= { should_recreate } ) "
335343 )
336344 chainhooks_failed += 1
337345 failed_chainhook_ids .append (chainhook_uuid )
338346
339- # Try to recreate the chainhook
340- new_uuid = self ._recreate_chainhook_for_chain_state (chain_state )
341- if new_uuid :
342- chainhooks_recreated += 1
343- recreated_chainhook_ids .append (new_uuid )
347+ # Only recreate if it's a permanent failure, not a temporary one
348+ if should_recreate :
344349 logger .info (
345- f"Successfully recreated chainhook { new_uuid } to replace unhealthy { chainhook_uuid } "
350+ f"Recreating chainhook { chainhook_uuid } due to permanent failure "
346351 )
352+ new_uuid = self ._recreate_chainhook_for_chain_state (chain_state )
353+ if new_uuid :
354+ chainhooks_recreated += 1
355+ recreated_chainhook_ids .append (new_uuid )
356+ logger .info (
357+ f"Successfully recreated chainhook { new_uuid } to replace failed { chainhook_uuid } "
358+ )
347359
348- # Delete the old chainhook if it exists
349- try :
350- self .platform_api .delete_chainhook (chainhook_uuid )
351- logger .info (f"Deleted old chainhook { chainhook_uuid } " )
352- except Exception as e :
353- logger .warning (
354- f"Failed to delete old chainhook { chainhook_uuid } : { str (e )} "
360+ # Delete the old chainhook if it exists
361+ try :
362+ self .platform_api .delete_chainhook (chainhook_uuid )
363+ logger .info (f"Deleted old chainhook { chainhook_uuid } " )
364+ except Exception as e :
365+ logger .warning (
366+ f"Failed to delete old chainhook { chainhook_uuid } : { str (e )} "
367+ )
368+ else :
369+ logger .error (
370+ f"Failed to recreate chainhook for chain state { chain_state .id } "
355371 )
356372 else :
357- logger .error (
358- f"Failed to recreate chainhook for chain state { chain_state . id } "
373+ logger .info (
374+ f"Skipping recreation of chainhook { chainhook_uuid } - likely temporary failure "
359375 )
360376 else :
361377 logger .debug (f"Chainhook { chainhook_uuid } is healthy" )
0 commit comments