Skip to content

Commit de03298

Browse files
committed
fix: delete pods after any terminal outcome (e.g. success/fail/timeout) in monitor
1 parent f9d32c2 commit de03298

1 file changed

Lines changed: 30 additions & 0 deletions

File tree

backend/app/services/pod_monitor/monitor.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,22 @@
66
import structlog
77
from kubernetes_asyncio import client as k8s_client
88
from kubernetes_asyncio import watch as k8s_watch
9+
from kubernetes_asyncio.client.rest import ApiException
910

1011
from app.core.metrics import KubernetesMetrics
1112
from app.core.utils import StringEnum
13+
from app.domain.enums import EventType
1214
from app.domain.events import DomainEvent
1315
from app.services.kafka_event_service import KafkaEventService
1416
from app.services.pod_monitor.config import PodMonitorConfig
1517
from app.services.pod_monitor.event_mapper import PodEventMapper, WatchEventType
1618

19+
# Event types that mark an execution as finished. When any event mapped from a
# pod carries one of these types, the monitor deletes that pod after publishing.
_TERMINAL_EVENT_TYPES: frozenset[str] = frozenset(
    (
        EventType.EXECUTION_COMPLETED,
        EventType.EXECUTION_FAILED,
        EventType.EXECUTION_TIMEOUT,
    )
)
24+
1725
# Type aliases
1826
type ResourceVersion = str
1927
type KubeEvent = dict[str, Any]
@@ -179,6 +187,10 @@ async def _process_pod_event(self, event: PodEvent) -> None:
179187
for app_event in app_events:
180188
await self._publish_event(app_event, event.pod)
181189

190+
# Delete pod once all data has been extracted and terminal events published
191+
if any(e.event_type in _TERMINAL_EVENT_TYPES for e in app_events):
192+
await self._delete_pod(event.pod)
193+
182194
if app_events:
183195
self.logger.info(
184196
f"Processed {event.event_type} event for pod {pod_name} "
@@ -206,3 +218,21 @@ async def _publish_event(self, event: DomainEvent, pod: k8s_client.V1Pod) -> Non
206218

207219
except Exception as e:
208220
self.logger.error(f"Error publishing event: {e}", exc_info=True)
221+
222+
async def _delete_pod(self, pod: k8s_client.V1Pod) -> None:
    """Delete a pod after its data has been fully extracted.

    Frees the ResourceQuota slot so new executor pods can be scheduled.
    The ConfigMap is garbage-collected automatically via ownerReference.

    Deletion is best-effort: API and transport errors are logged rather
    than raised, so a cleanup failure cannot interrupt event processing
    in the caller after the terminal events were already published.

    Args:
        pod: The pod to remove; ``pod.metadata.name`` and
            ``pod.metadata.namespace`` identify the delete target.
    """
    pod_name = pod.metadata.name
    try:
        # grace_period_seconds=0 asks the API server to remove the pod
        # immediately instead of waiting out the termination grace period.
        await self._v1.delete_namespaced_pod(
            name=pod_name, namespace=pod.metadata.namespace, grace_period_seconds=0,
        )
    except ApiException as e:
        if e.status == 404:
            # Someone else (or an earlier event) already removed the pod.
            self.logger.debug(f"Pod {pod_name} already deleted")
        else:
            self.logger.warning(f"Failed to delete pod {pod_name}: {e.reason}")
    except Exception as e:
        # Timeouts and connection errors are not ApiException; swallow them
        # here too, mirroring the error handling in _publish_event.
        # asyncio.CancelledError derives from BaseException and still propagates.
        self.logger.warning(f"Failed to delete pod {pod_name}: {e}", exc_info=True)
    else:
        self.logger.info(f"Deleted completed pod {pod_name}")

0 commit comments

Comments
 (0)