Skip to content

Commit 4b45347

Browse files
committed
fix: detected issues
1 parent 6c3db52 commit 4b45347

5 files changed

Lines changed: 22 additions & 16 deletions

File tree

.github/actions/e2e-ready/action.yml

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,12 @@ runs:
3030
shell: bash
3131
run: |
3232
KUEUE_VERSION="${KUEUE_VERSION:-v0.16.1}"
33-
kubectl apply --server-side -f "https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml"
33+
KUEUE_MANIFEST_SHA256="${KUEUE_MANIFEST_SHA256:-3201a66ff731be440ecfcf3c0fa5979d001b834f68389208fe7ee18017fbcfe8}"
34+
KUEUE_MANIFEST="/tmp/kueue-manifests.yaml"
35+
curl -fsSL -o "$KUEUE_MANIFEST" "https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml"
36+
echo "${KUEUE_MANIFEST_SHA256} ${KUEUE_MANIFEST}" | sha256sum -c -
37+
kubectl apply --server-side -f "$KUEUE_MANIFEST"
38+
rm -f "$KUEUE_MANIFEST"
3439
kubectl wait --for=condition=Available --timeout=120s \
3540
deployment/kueue-controller-manager -n kueue-system
3641
kubectl apply --server-side -f - <<'EOF'

.github/workflows/stack-tests.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@ env:
3232
K3S_VERSION: v1.32.11+k3s1
3333
K3S_INSTALL_SHA256: d75e014f2d2ab5d30a318efa5c326f3b0b7596f194afcff90fa7a7a91166d5f7
3434
KUEUE_VERSION: v0.16.1
35+
KUEUE_MANIFEST_SHA256: 3201a66ff731be440ecfcf3c0fa5979d001b834f68389208fe7ee18017fbcfe8
3536

3637
jobs:
3738
# Fast unit tests (no infrastructure needed)

backend/app/services/pod_monitor/monitor.py

Lines changed: 6 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -183,11 +183,9 @@ async def _process_pod_event(self, event: PodEvent) -> None:
183183
# Map to application events
184184
app_events = await self._event_mapper.map_pod_event(event.pod, event.event_type)
185185

186-
# Publish events
187186
for app_event in app_events:
188187
await self._publish_event(app_event, event.pod)
189188

190-
# Delete pod once all data has been extracted and terminal events published
191189
if any(e.event_type in _TERMINAL_EVENT_TYPES for e in app_events):
192190
await self._delete_pod(event.pod)
193191

@@ -207,22 +205,18 @@ async def _process_pod_event(self, event: PodEvent) -> None:
207205

208206
async def _publish_event(self, event: DomainEvent, pod: k8s_client.V1Pod) -> None:
209207
"""Publish event to Kafka and store in events collection."""
210-
try:
211-
execution_id = getattr(event, "execution_id", None) or event.aggregate_id
212-
key = str(execution_id or (pod.metadata.name if pod.metadata else "unknown"))
213-
214-
await self._kafka_event_service.publish_event(event=event, key=key)
208+
execution_id = getattr(event, "execution_id", None) or event.aggregate_id
209+
key = str(execution_id or (pod.metadata.name if pod.metadata else "unknown"))
215210

216-
phase = pod.status.phase if pod.status else "Unknown"
217-
self._metrics.record_pod_monitor_event_published(event.event_type, phase)
211+
await self._kafka_event_service.publish_event(event=event, key=key)
218212

219-
except Exception as e:
220-
self.logger.error(f"Error publishing event: {e}", exc_info=True)
213+
phase = pod.status.phase if pod.status else "Unknown"
214+
self._metrics.record_pod_monitor_event_published(event.event_type, phase)
221215

222216
async def _delete_pod(self, pod: k8s_client.V1Pod) -> None:
223217
"""Delete a pod after its data has been fully extracted.
224218
225-
Frees the ResourceQuota slot so new executor pods can be scheduled.
219+
Frees the Kueue quota slot so gated executor pods can be admitted.
226220
The ConfigMap is garbage-collected automatically via ownerReference.
227221
"""
228222
pod_name = pod.metadata.name

backend/tests/unit/services/pod_monitor/test_monitor.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -403,5 +403,6 @@ async def produce(
403403
pod = make_pod(name="no-meta-pod", phase="Pending")
404404
pod.metadata = None # type: ignore[assignment]
405405

406-
# Should not raise - errors are caught and logged
407-
await pm._publish_event(event, pod)
406+
# Exception propagates — _process_pod_event's broad except handles it
407+
with pytest.raises(RuntimeError, match="Publish failed"):
408+
await pm._publish_event(event, pod)

cert-generator/setup-k8s.sh

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -91,8 +91,13 @@ kubectl create namespace integr8scode --dry-run=client -o yaml | kubectl apply -
9191

9292
# Install Kueue (scheduling-gate based quota management)
9393
KUEUE_VERSION="${KUEUE_VERSION:-v0.16.1}"
94+
KUEUE_MANIFEST_SHA256="${KUEUE_MANIFEST_SHA256:-3201a66ff731be440ecfcf3c0fa5979d001b834f68389208fe7ee18017fbcfe8}"
9495
echo "Installing Kueue ${KUEUE_VERSION}..."
95-
kubectl apply --server-side -f "https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml"
96+
KUEUE_MANIFEST="/tmp/kueue-manifests.yaml"
97+
curl -fsSL -o "$KUEUE_MANIFEST" "https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml"
98+
echo "${KUEUE_MANIFEST_SHA256} ${KUEUE_MANIFEST}" | sha256sum -c -
99+
kubectl apply --server-side -f "$KUEUE_MANIFEST"
100+
rm -f "$KUEUE_MANIFEST"
96101
kubectl wait --for=condition=Available --timeout=120s \
97102
deployment/kueue-controller-manager -n kueue-system
98103

0 commit comments

Comments
 (0)