Skip to content

Commit 6c5d3bc

Browse files
committed
Decouple metrics calculation from Python (moved to a strict shell-only entrypoint); move metrics processing from the k8s service (now responsible only for spinning up/starting pods and returning their data) to the execution service.
1 parent 517740e commit 6c5d3bc

5 files changed

Lines changed: 153 additions & 143 deletions

File tree

backend/app/scripts/entrypoint.py

Lines changed: 0 additions & 67 deletions
This file was deleted.

backend/app/scripts/entrypoint.sh

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
#!/bin/sh
# Pod entrypoint: run the supplied command, sample its memory/CPU usage from
# /proc while it runs, and emit exactly one JSON object on stdout containing
# the exit code, resource usage, and captured stdout/stderr.
#
# NOTE(review): this aims at portable POSIX sh, but two constructs are
# extensions commonly available in GNU/BusyBox userlands — `date +%s.%N`
# (nanosecond precision) and fractional `sleep 0.05`. Confirm the container
# image ships them (busybox and coreutils both do).

# Escape a stream for safe embedding inside a JSON string value.
json_escape() {
    # Slurp the whole input into pattern space (label/branch written as
    # separate -e expressions: `:a;N` with a semicolon after a label is a
    # GNU-sed-only form), then escape backslash, quote, newline, tab, CR.
    sed -e ':a' -e 'N' -e '$!ba' \
        -e 's/\\/\\\\/g' \
        -e 's/"/\\"/g' \
        -e 's/\n/\\n/g' \
        -e 's/\t/\\t/g' \
        -e 's/\r/\\r/g'
}

# Exit immediately (but successfully, so the pod log stays parseable JSON)
# if no command was provided.
if [ "$#" -eq 0 ]; then
    printf '{"exit_code": 127, "resource_usage": null, "stdout": "", "stderr": "Entrypoint Error: No command provided."}'
    exit 0
fi

# Create temporary files for stdout and stderr. Abort if mktemp fails —
# `set -e` covers exactly this setup section and is switched off again before
# the user command runs, so a failing user command cannot kill this script.
set -e
OUT=$(mktemp)
ERR=$(mktemp)
trap 'rm -f "$OUT" "$ERR"' EXIT
set +e

# Record start time (%s.%N: seconds.nanoseconds — see portability note above).
START_TIME=$(date +%s.%N)

# Run the user command in a background subshell so its failure cannot crash
# this wrapper; redirect its streams into the temp files.
( "$@" >"$OUT" 2>"$ERR" ) &
PID=$!

PEAK_KB=0
JIFS=0

# Poll /proc while the process directory exists — the most reliable liveness
# check available to plain sh. All reads are silenced: the process can vanish
# between the -d test and the read (benign race).
while [ -d "/proc/$PID" ]; do
    # VmHWM = peak resident set ("high water mark"), in KiB.
    CUR_KB=$(grep VmHWM "/proc/$PID/status" 2>/dev/null | awk '{print $2}')
    if [ -n "$CUR_KB" ] && [ "$CUR_KB" -gt "$PEAK_KB" ]; then
        PEAK_KB=$CUR_KB
    fi

    # Fields 14+15 of /proc/<pid>/stat are utime+stime in clock ticks.
    # awk tolerates empty/malformed input without crashing the shell.
    CPU_JIFFIES=$(awk '{print $14 + $15}' "/proc/$PID/stat" 2>/dev/null)
    if [ -n "$CPU_JIFFIES" ]; then
        JIFS=$CPU_JIFFIES
    fi
    sleep 0.05
done

# Reap the child and collect its exit code. Works because this wrapper is the
# pod's PID 1's direct child manager for the command (we spawned it).
wait "$PID"
EXIT_CODE=$?

END_TIME=$(date +%s.%N)
# Floating-point elapsed time via awk, fixed to 6 decimal places.
ELAPSED_S=$(awk -v end="$END_TIME" -v start="$START_TIME" 'BEGIN { printf "%.6f", end - start }')

# Clock ticks per second, defaulting defensively to the common value 100.
CLK_TCK=$(getconf CLK_TCK 2>/dev/null || printf "100")

# Escape the captured streams ONCE and reuse the results below (the previous
# version computed these and then re-ran the whole cat|escape pipeline again
# inside the heredoc, discarding these variables).
OUT_JSON=$(json_escape < "$OUT")
ERR_JSON=$(json_escape < "$ERR")

# Every interpolated value carries a default so the JSON stays well-formed
# even if a sampling step never ran.
json=$(cat <<EOF
{
    "exit_code": ${EXIT_CODE:-1},
    "resource_usage": {
        "execution_time_wall_seconds": ${ELAPSED_S:-0},
        "cpu_time_jiffies": ${JIFS:-0},
        "clk_tck_hertz": ${CLK_TCK:-100},
        "peak_memory_kb": ${PEAK_KB:-0}
    },
    "stdout": "$OUT_JSON",
    "stderr": "$ERR_JSON"
}
EOF
)

# Final output — no trailing newline; the consumer parses the log verbatim.
printf '%s' "$json"

backend/app/services/execution_service.py

Lines changed: 43 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -114,56 +114,61 @@ async def _get_k8s_execution_output(
114114
return output, error_msg, phase, resource_usage
115115

116116
async def _try_finalize_execution(self, execution: ExecutionInDB) -> Optional[ExecutionInDB]:
117-
"""
118-
Checks K8s for a final status. If found, updates the database and
119-
returns the updated execution object. Otherwise, returns None.
120-
"""
121-
output, _, final_phase, resources = await self._get_k8s_execution_output(execution.id)
117+
try:
118+
metrics, final_phase = await self.k8s_service.get_pod_logs(execution.id)
119+
except KubernetesPodError as e:
120+
logger.error(f"K8s pod error finalizing execution {execution.id}: {e}")
121+
update_data = {"status": ExecutionStatus.ERROR, "errors": str(e), "resource_usage": {"pod_phase": "Error"}}
122+
except Exception as e:
123+
logger.error(f"Unexpected error finalizing execution {execution.id}: {e}", exc_info=True)
124+
update_data = {"status": ExecutionStatus.ERROR, "errors": f"Unexpected infrastructure error: {e}",
125+
"resource_usage": {"pod_phase": "Error"}}
126+
else:
127+
logger.info(f"Successfully parsed metrics from pod: {metrics}")
128+
129+
exit_code = metrics.get("exit_code")
130+
res_usage = metrics.get("resource_usage")
131+
132+
if not res_usage:
133+
return None # waiting for results
122134

123-
update_data = {}
135+
wall_s = res_usage.get("execution_time_wall_seconds") or 0
136+
jiffies = float(res_usage.get("cpu_time_jiffies", 0) or 0)
137+
hertz = float(res_usage.get("clk_tck_hertz", 100) or 100)
138+
cpu_s = jiffies / hertz if hertz > 0 else 0.0 # total CPU-time
124139

125-
# Now we only have basic metrics from K8s
126-
if resources:
127-
exit_code = resources.get("exit_code", 1)
140+
# average CPU in millicores: (CPU-seconds / wall-seconds) × 1000
141+
cpu_millicores = (cpu_s / wall_s * 1000) if wall_s else 0.0
142+
143+
# VmHWM is k *ibi*bytes → MiB = KiB / 1024
144+
peak_kib = float(res_usage.get("peak_memory_kb", 0) or 0)
145+
peak_mib = peak_kib / 1024.0
146+
147+
final_resource_usage = {
148+
"execution_time": round(wall_s, 6),
149+
"cpu_usage": round(cpu_millicores, 2),
150+
"memory_usage": round(peak_mib, 2),
151+
}
152+
153+
final_resource_usage["pod_phase"] = final_phase
128154

129155
if exit_code == 0:
130156
update_data = {
131157
"status": ExecutionStatus.COMPLETED,
132-
"output": output or "",
158+
"output": metrics.get("stdout", ""),
133159
"errors": None,
134-
"resource_usage": resources # Only has exit_code, execution_time, pod_phase
160+
"resource_usage": final_resource_usage,
135161
}
136162
else:
137-
# Script failed - output contains stderr/stdout
138-
error_details = output or f"Script failed with exit code {exit_code}"
163+
error_details = metrics.get("stderr") or f"Script failed with exit code {exit_code}."
139164
update_data = {
140165
"status": ExecutionStatus.ERROR,
141-
"output": "",
166+
"output": metrics.get("stdout", ""),
142167
"errors": error_details,
143-
"resource_usage": resources
168+
"resource_usage": final_resource_usage,
144169
}
145-
else:
146-
# No metrics at all - use pod phase
147-
if final_phase == "Succeeded":
148-
update_data = {
149-
"status": ExecutionStatus.COMPLETED,
150-
"output": output or "",
151-
"errors": None,
152-
"resource_usage": {"pod_phase": final_phase}
153-
}
154-
else:
155-
error_details = output or f"Pod failed with phase '{final_phase}'"
156-
update_data = {
157-
"status": ExecutionStatus.ERROR,
158-
"output": "",
159-
"errors": error_details,
160-
"resource_usage": {"pod_phase": final_phase}
161-
}
162-
163-
if not update_data:
164-
return None
165170

166-
logger.info(f"Finalizing execution {execution.id} with status: {update_data['status']}")
171+
logger.info(f"Finalizing execution {execution.id} with status: {update_data.get('status', 'unknown')}")
167172
update_payload = ExecutionUpdate(**update_data).model_dump(exclude_unset=True)
168173
await self.execution_repo.update_execution(execution.id, update_payload)
169174

@@ -219,8 +224,8 @@ async def execute_script(
219224
ExecutionUpdate(status=ExecutionStatus.ERROR, errors=str(e)).model_dump(exclude_unset=True)
220225
)
221226
raise IntegrationException(status_code=500,
222-
detail=f"Internal server error during script execution request: "
223-
f"{str(e)}") from e
227+
detail=f"Internal server error during script execution request: "
228+
f"{str(e)}") from e
224229
finally:
225230
EXECUTION_DURATION.labels(python_version=python_version).observe(time() - start_time)
226231
ACTIVE_EXECUTIONS.dec()

backend/app/services/kubernetes_service.py

Lines changed: 25 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1+
import ast
12
import asyncio
2-
import json
33
import os
44
from datetime import datetime, timedelta, timezone
55
from pathlib import Path
@@ -97,7 +97,11 @@ async def graceful_shutdown(self) -> None:
9797
logger.warning("Shutdown timeout reached, forcing pod termination")
9898
break
9999
try:
100-
await self._cleanup_pod_resources(pod_name)
100+
if not pod_name.startswith("execution-"):
101+
return
102+
execution_id = pod_name[len("execution-"):]
103+
config_map_name = f"script-{execution_id}"
104+
await self._cleanup_resources(pod_name, config_map_name)
101105
except Exception as e:
102106
logger.error(f"Error during pod cleanup on shutdown: {str(e)}")
103107

@@ -156,18 +160,18 @@ async def create_execution_pod(
156160
pod_name = f"execution-{execution_id}"
157161

158162
try:
159-
entrypoint_script_path = Path("app/scripts/entrypoint.py")
163+
entrypoint_script_path = Path("app/scripts/entrypoint.sh")
160164
entrypoint_code = await asyncio.to_thread(entrypoint_script_path.read_text)
161165

162-
config_map_data["entrypoint.py"] = entrypoint_code
166+
config_map_data["entrypoint.sh"] = entrypoint_code
163167

164168
config_map_body = k8s_client.V1ConfigMap(
165169
metadata=k8s_client.V1ObjectMeta(name=config_map_name),
166170
data=config_map_data
167171
)
168172
await self._create_config_map(config_map_body)
169173

170-
final_pod_command = ["/scripts/entrypoint.py"] + command
174+
final_pod_command = ["/bin/sh", "/scripts/entrypoint.sh", *command]
171175

172176
builder = PodManifestBuilder(
173177
execution_id=execution_id,
@@ -193,44 +197,34 @@ async def create_execution_pod(
193197
await self._cleanup_resources(pod_name, config_map_name)
194198
raise KubernetesPodError(f"Failed to create execution pod: {str(e)}") from e
195199

196-
async def get_pod_logs(self, execution_id: str) -> tuple[str, str, dict]:
197-
# This method reverts to the simple version that parses the clean log output
200+
async def get_pod_logs(self, execution_id: str) -> tuple[dict, str]:
198201
pod_name = f"execution-{execution_id}"
199202
config_map_name = f"script-{execution_id}"
200-
201203
try:
202204
pod = await self._wait_for_pod_completion(pod_name)
203205
pod_phase = pod.status.phase if pod and pod.status else "Unknown"
204206
full_logs = await self._get_container_logs(pod_name, "script-runner")
207+
logger.info(f"Raw logs from pod {pod_name}:\n---\n{full_logs}\n---")
205208

206-
# The simple, reliable parser for the ###METRICS### block
207-
output, metrics = self._extract_execution_metrics(full_logs)
208-
209-
final_exit_code = metrics.get("exit_code", 1)
210-
metrics["pod_phase"] = pod_phase
211-
metrics["status"] = "completed" if final_exit_code == 0 else "error"
212-
213-
return output, pod_phase, metrics
209+
try:
210+
# https://stackoverflow.com/questions/15197673/using-pythons-eval-vs-ast-literal-eval
211+
metrics = ast.literal_eval(full_logs)
212+
return metrics, pod_phase
213+
except (ValueError, SyntaxError, TypeError) as e:
214+
logger.error(f"FAILED TO PARSE LOGS FROM POD {pod_name} as a Python literal: {e}")
215+
error_payload = {
216+
"exit_code": -1,
217+
"stdout": "",
218+
"stderr": f"Internal execution error: Pod logs were not valid JSON. "
219+
f"Pod phase: {pod_phase}.\nRaw Logs:\n{full_logs}",
220+
"resource_usage": None,
221+
}
222+
return error_payload, pod_phase
214223
finally:
215224
logger.info(f"Initiating cleanup for execution '{execution_id}'...")
216225
await self._cleanup_resources(pod_name, config_map_name)
217226
self._active_pods.pop(execution_id, None)
218227

219-
def _extract_execution_metrics(self, logs: str) -> tuple[str, dict]:
220-
# This is the simple parser for the entrypoint.py output
221-
split_marker = "\n###METRICS###\n"
222-
if split_marker in logs:
223-
output, metrics_json = logs.rsplit(split_marker, 1)
224-
try:
225-
metrics_data = json.loads(metrics_json)
226-
return output.strip(), metrics_data
227-
except json.JSONDecodeError:
228-
logger.error(f"Failed to decode metrics JSON: {metrics_json}")
229-
return logs.strip(), {"error": "Failed to decode metrics JSON.", "exit_code": 1}
230-
231-
logger.warning("Metrics marker not found in logs.")
232-
return logs.strip(), {"error": "Metrics marker not found in logs.", "exit_code": 1}
233-
234228
async def _wait_for_pod_completion(self, pod_name: str) -> k8s_client.V1Pod:
235229
logger.info(f"Waiting for pod '{pod_name}' to complete...")
236230
for _ in range(self.POD_RETRY_ATTEMPTS):
@@ -292,13 +286,6 @@ async def _cleanup_resources(self, pod_name: str, config_map_name: str) -> None:
292286
except ApiException as e:
293287
logger.error(f"Failed to delete config map '{config_map_name}': {e.reason}")
294288

295-
async def _cleanup_pod_resources(self, pod_name: str) -> None:
296-
if not pod_name.startswith("execution-"):
297-
return
298-
execution_id = pod_name[len("execution-"):]
299-
config_map_name = f"script-{execution_id}"
300-
await self._cleanup_resources(pod_name, config_map_name)
301-
302289

303290
def get_k8s_manager(request: Request) -> KubernetesServiceManager:
304291
if not hasattr(request.app.state, "k8s_manager"):

backend/app/services/pod_manifest_builder.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ def build(self) -> Dict[str, Any]:
4444
"name": "script-runner",
4545
"image": self.image,
4646
"command": self.command,
47+
"args": [],
4748
"resources": {
4849
"limits": {"cpu": self.pod_cpu_limit, "memory": self.pod_memory_limit},
4950
"requests": {"cpu": self.pod_cpu_request, "memory": self.pod_memory_request},

0 commit comments

Comments
 (0)