diff --git a/helm/blueapi/README.md b/helm/blueapi/README.md
index 4ece387b7..94093e826 100644
--- a/helm/blueapi/README.md
+++ b/helm/blueapi/README.md
@@ -32,6 +32,7 @@ A Helm chart deploying a worker pod that runs Bluesky plans
 | podAnnotations | object | `{}` | |
 | podLabels | object | `{}` | |
 | podSecurityContext | object | `{}` | |
+| pvcAutoDeletion | object | `{"enabled":true}` | If enabled, runs a weekly CronJob that deletes blueapi scratch PVCs unused for more than 3 months. To protect a PVC from deletion, set the annotation "protected" to "true" on it. |
 | readinessProbe | object | `{"failureThreshold":2,"httpGet":{"path":"/healthz","port":"http"},"periodSeconds":10}` | Readiness probe, if configured kubernetes will not route traffic to this pod if failed consecutively. This could allow the service time to recover if it is being overwhelmed by traffic, but without the to ability to load balance or scale up/outwards, upstream services will need to know to back off. This is automatically disabled when in debug mode. |
 | resources | object | `{"limits":{"cpu":"2000m","memory":"4000Mi"},"requests":{"cpu":"200m","memory":"400Mi"}}` | Sets the compute resources available to the pod. These defaults are appropriate when using debug mode or an internal PVC and therefore running VS Code server in the pod. In the Diamond cluster, requests must be >= 0.1*limits When not using either of the above, the limits may be lowered. When idle but connected, blueapi consumes ~400MB of memory and 1% cpu and may struggle when allocated less. |
 | restartOnConfigChange | bool | `true` | If enabled the blueapi pod will restart on changes to `worker` |
@@ -44,6 +45,7 @@ A Helm chart deploying a worker pod that runs Bluesky plans
 | serviceAccount.create | bool | `false` | |
 | serviceAccount.name | string | `""` | |
 | startupProbe | object | `{"failureThreshold":5,"httpGet":{"path":"/healthz","port":"http"},"periodSeconds":10}` | A more lenient livenessProbe to allow the service to start fully. This is automatically disabled when in debug mode. |
+| timeStampCron | object | `{"enabled":true}` | If enabled, runs a daily CronJob that stamps blueapi scratch PVCs with a last-used annotation when mounted by a running pod |
 | tolerations | list | `[]` | May be required to run on specific nodes (e.g. the control machine) |
 | tracing | object | `{"fastapi":{"excludedURLs":"/healthz"},"otlp":{"enabled":false,"protocol":"http/protobuf","server":{"host":"http://opentelemetry-collector.tracing","port":4318}}}` | Exclude health probe requests from tracing by default to prevent spamming |
 | volumeMounts | list | `[]` | Additional volumeMounts on the output StatefulSet definition. Define how volumes are mounted to the container referenced by using the same name. |
diff --git a/helm/blueapi/files/scripts/pvc-deletion.sh b/helm/blueapi/files/scripts/pvc-deletion.sh
new file mode 100644
--- /dev/null
+++ b/helm/blueapi/files/scripts/pvc-deletion.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+# Deletes blueapi scratch PVCs whose "last-used" annotation is more than
+# MAX_AGE_SECONDS old, unless the PVC is annotated with protected="true".
+# PVCs without a "last-used" annotation are reported and left untouched.
+# Required environment: RELEASE_NAMESPACE, RELEASE_FULLNAME.
+set -euo pipefail
+
+: "${RELEASE_NAMESPACE:?must be set}"
+: "${RELEASE_FULLNAME:?must be set}"
+
+# Three months (91.25 days) expressed in seconds.
+readonly MAX_AGE_SECONDS=7884000
+readonly PVC_PREFIX="${RELEASE_FULLNAME}-scratch-"
+
+# Prints the value of annotation $2 on PVC $1 (empty when it is not set).
+pvc_annotation() {
+  kubectl get pvc "$1" -n "$RELEASE_NAMESPACE" \
+    -o=jsonpath="{.metadata.annotations.$2}"
+}
+
+# Names of all PVCs in the namespace, one per line.
+pvc_list=$(kubectl get pvc -n "$RELEASE_NAMESPACE" \
+  -o=jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')
+mapfile -t pvcs <<< "$pvc_list"
+
+now=$(date +%s)
+failed=0
+for pvc in "${pvcs[@]}"; do
+  # Literal prefix match, so the release name is never treated as a regex.
+  [[ "$pvc" == "$PVC_PREFIX"* ]] || continue
+
+  last_used=$(pvc_annotation "$pvc" last-used)
+  if [[ -z "$last_used" ]]; then
+    echo " $pvc has no last-used annotation"
+    continue
+  fi
+  # The annotation value is external input. Accept digits only so it is never
+  # evaluated as an arbitrary arithmetic expression.
+  if [[ ! "$last_used" =~ ^[0-9]+$ ]]; then
+    echo " $pvc has an invalid last-used annotation, skipping" >&2
+    continue
+  fi
+  # 10# forces base 10 so a leading zero is not read as octal.
+  if (( now - 10#$last_used <= MAX_AGE_SECONDS )); then
+    continue
+  fi
+  # Assigned separately so a failed lookup aborts the script (set -e) instead
+  # of being mistaken for "not protected".
+  protected=$(pvc_annotation "$pvc" protected)
+  if [[ "$protected" == "true" ]]; then
+    echo " PVC $pvc is protected, skipping deletion"
+    continue
+  fi
+  # Keep going after a failed deletion so the remaining PVCs are still handled.
+  if ! kubectl delete pvc "$pvc" -n "$RELEASE_NAMESPACE" --wait=true; then
+    echo " failed to delete PVC $pvc" >&2
+    failed=1
+  fi
+done
+
+exit "$failed"
diff --git a/helm/blueapi/files/scripts/time-stamper.sh b/helm/blueapi/files/scripts/time-stamper.sh
new file mode 100644
--- /dev/null
+++ b/helm/blueapi/files/scripts/time-stamper.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Stamps every blueapi scratch PVC that is referenced by a pod in the release
+# namespace with a "last-used" annotation holding the current epoch time.
+# Required environment: RELEASE_NAMESPACE, RELEASE_FULLNAME.
+set -euo pipefail
+
+: "${RELEASE_NAMESPACE:?must be set}"
+: "${RELEASE_FULLNAME:?must be set}"
+
+readonly PVC_PREFIX="${RELEASE_FULLNAME}-scratch-"
+
+# Claim names of all PVC volumes declared by pods in the namespace,
+# de-duplicated. The pod list is not filtered by phase.
+claim_list=$(kubectl get pods -n "$RELEASE_NAMESPACE" \
+  -o=jsonpath='{.items[*].spec.volumes[*].persistentVolumeClaim.claimName}' \
+  | tr ' ' '\n' | sort -u)
+mapfile -t claims <<< "$claim_list"
+
+now=$(date +%s)
+failed=0
+for pvc in "${claims[@]}"; do
+  # Literal prefix match, so the release name is never treated as a regex.
+  [[ "$pvc" == "$PVC_PREFIX"* ]] || continue
+  # Keep going after a failure so the remaining PVCs are still stamped.
+  if ! kubectl annotate --overwrite pvc "$pvc" -n "$RELEASE_NAMESPACE" \
+    last-used="$now"; then
+    echo " failed to annotate PVC $pvc" >&2
+    failed=1
+  fi
+done
+
+exit "$failed"
diff --git a/helm/blueapi/templates/cronjob-configmaps.yaml b/helm/blueapi/templates/cronjob-configmaps.yaml
new file mode 100644
index 000000000..188bb1a5f
--- /dev/null
+++ b/helm/blueapi/templates/cronjob-configmaps.yaml
@@ -0,0 +1,22 @@
+{{- if .Values.timeStampCron.enabled }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name : {{include "blueapi.fullname" . }}-pvc-stamper-script
+data:
+  {{- $files := .Files }}
+  time-stamper.sh: |-
+{{ $files.Get "files/scripts/time-stamper.sh" | indent 4 }}
+---
+{{- end }}
+
+{{- if .Values.pvcAutoDeletion.enabled }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name : {{include "blueapi.fullname" . }}-pvc-auto-deletion-script
+data:
+  {{- $files := .Files }}
+  pvc-deletion.sh: |-
+{{ $files.Get "files/scripts/pvc-deletion.sh" | indent 4 }}
+{{- end }}
diff --git a/helm/blueapi/templates/cronjob.yaml b/helm/blueapi/templates/cronjob.yaml
new file mode 100644
--- /dev/null
+++ b/helm/blueapi/templates/cronjob.yaml
@@ -0,0 +1,179 @@
+{{- if .Values.timeStampCron.enabled }}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ include "blueapi.fullname" . }}-last-used-stamper
+  namespace: {{ .Release.Namespace }}
+automountServiceAccountToken: true
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: {{ include "blueapi.fullname" . }}-last-used-stamper
+  namespace: {{ .Release.Namespace }}
+rules:
+# Listing pods is all that is needed to find the PVCs they reference.
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["list"]
+# kubectl annotate reads the PVC and then patches it.
+- apiGroups: [""]
+  resources: ["persistentvolumeclaims"]
+  verbs: ["get", "patch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: {{ include "blueapi.fullname" . }}-last-used-stamper
+  namespace: {{ .Release.Namespace }}
+subjects:
+- kind: ServiceAccount
+  name: {{ include "blueapi.fullname" . }}-last-used-stamper
+  namespace: {{ .Release.Namespace }}
+roleRef:
+  kind: Role
+  name: {{ include "blueapi.fullname" . }}-last-used-stamper
+  apiGroup: rbac.authorization.k8s.io
+---
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: {{ include "blueapi.fullname" . }}-last-used-stamper
+  namespace: {{ .Release.Namespace }}
+spec:
+  concurrencyPolicy: Forbid
+  successfulJobsHistoryLimit: 1
+  failedJobsHistoryLimit: 1
+  schedule: "@daily"
+
+  jobTemplate:
+    spec:
+      # amount of attempts of labeling a pvc
+      backoffLimit: 3
+      # job stops after 180 seconds
+      activeDeadlineSeconds: 180
+      template:
+        spec:
+          serviceAccountName: {{ include "blueapi.fullname" . }}-last-used-stamper
+          {{- with .Values.tolerations }}
+          tolerations:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          {{- with .Values.nodeSelector }}
+          nodeSelector:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          {{- with .Values.affinity }}
+          affinity:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          volumes:
+            - name: {{include "blueapi.fullname" . }}-pvc-stamper-script
+              configMap:
+                name: {{include "blueapi.fullname" . }}-pvc-stamper-script
+                defaultMode: 0555
+          containers:
+            - name: last-used-stamper
+              env:
+                - name: RELEASE_NAME
+                  value: {{ .Release.Name | quote }}
+                - name: RELEASE_NAMESPACE
+                  value: {{ .Release.Namespace | quote }}
+                - name: RELEASE_FULLNAME
+                  value: {{ include "blueapi.fullname" . | quote }}
+              volumeMounts:
+                - name: {{include "blueapi.fullname" . }}-pvc-stamper-script
+                  mountPath: /scripts
+              image: rancher/kubectl@sha256:05d2b313e2f397e0ade252136aed47abd72d56ead11d1b027ac70f66362c8495 # v1.36.0
+              imagePullPolicy: IfNotPresent
+              command: ["/scripts/time-stamper.sh"]
+          restartPolicy: OnFailure
+{{- end }}
+{{- if .Values.pvcAutoDeletion.enabled }}
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ include "blueapi.fullname" . }}-pvc-auto-deletion
+  namespace: {{ .Release.Namespace }}
+automountServiceAccountToken: true
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: {{ include "blueapi.fullname" . }}-pvc-auto-deletion
+  namespace: {{ .Release.Namespace }}
+rules:
+# The deletion script only lists, reads and deletes PVCs; it never patches them.
+- apiGroups: [""]
+  resources: ["persistentvolumeclaims"]
+  verbs: ["get", "list", "delete"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: {{ include "blueapi.fullname" . }}-pvc-auto-deletion
+  namespace: {{ .Release.Namespace }}
+subjects:
+- kind: ServiceAccount
+  name: {{ include "blueapi.fullname" . }}-pvc-auto-deletion
+  namespace: {{ .Release.Namespace }}
+roleRef:
+  kind: Role
+  name: {{ include "blueapi.fullname" . }}-pvc-auto-deletion
+  apiGroup: rbac.authorization.k8s.io
+---
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: {{ include "blueapi.fullname" . }}-pvc-auto-deletion
+  namespace: {{ .Release.Namespace }}
+spec:
+  concurrencyPolicy: Forbid
+  successfulJobsHistoryLimit: 1
+  failedJobsHistoryLimit: 1
+  schedule: "@weekly"
+
+  jobTemplate:
+    spec:
+      # amount of attempts for pvc deletion
+      backoffLimit: 3
+      # job stops after 300 seconds
+      activeDeadlineSeconds: 300
+      template:
+        spec:
+          serviceAccountName: {{ include "blueapi.fullname" . }}-pvc-auto-deletion
+          {{- with .Values.tolerations }}
+          tolerations:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          {{- with .Values.nodeSelector }}
+          nodeSelector:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          {{- with .Values.affinity }}
+          affinity:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          volumes:
+            - name: {{include "blueapi.fullname" . }}-pvc-auto-deletion-script
+              configMap:
+                name: {{include "blueapi.fullname" . }}-pvc-auto-deletion-script
+                defaultMode: 0555
+          containers:
+            - name: pvc-auto-deletion
+              env:
+                - name: RELEASE_NAME
+                  value: {{ .Release.Name | quote }}
+                - name: RELEASE_NAMESPACE
+                  value: {{ .Release.Namespace | quote }}
+                - name: RELEASE_FULLNAME
+                  value: {{ include "blueapi.fullname" . | quote }}
+              volumeMounts:
+                - name: {{include "blueapi.fullname" . }}-pvc-auto-deletion-script
+                  mountPath: /scripts
+              image: rancher/kubectl@sha256:05d2b313e2f397e0ade252136aed47abd72d56ead11d1b027ac70f66362c8495 # v1.36.0
+              imagePullPolicy: IfNotPresent
+              command: ["/scripts/pvc-deletion.sh"]
+          restartPolicy: OnFailure
+{{- end }}
diff --git a/helm/blueapi/values.schema.json b/helm/blueapi/values.schema.json
index 74deedadb..f3b9b95bd 100644
--- a/helm/blueapi/values.schema.json
+++ b/helm/blueapi/values.schema.json
@@ -174,6 +174,15 @@
     "podSecurityContext": {
       "type": "object"
     },
+    "pvcAutoDeletion": {
+      "description": "If enabled, runs a weekly CronJob that deletes blueapi scratch PVCs unused for more than 3 months. To protect a PVC from deletion, set the annotation \"protected\" to \"true\" on it.",
+      "type": "object",
+      "properties": {
+        "enabled": {
+          "type": "boolean"
+        }
+      }
+    },
     "readinessProbe": {
       "description": "Readiness probe, if configured kubernetes will not route traffic to this pod if failed consecutively. This could allow the service time to recover if it is being overwhelmed by traffic, but without the to ability to load balance or scale up/outwards, upstream services will need to know to back off. This is automatically disabled when in debug mode.",
       "type": "object",
@@ -292,6 +301,15 @@
         }
       }
     },
+    "timeStampCron": {
+      "description": "If enabled, runs a daily CronJob that stamps blueapi scratch PVCs with a last-used annotation when mounted by a running pod",
+      "type": "object",
+      "properties": {
+        "enabled": {
+          "type": "boolean"
+        }
+      }
+    },
     "tolerations": {
       "description": "May be required to run on specific nodes (e.g. the control machine)",
       "type": "array"
diff --git a/helm/blueapi/values.yaml b/helm/blueapi/values.yaml
index 36a82eeb5..4892751f4 100644
--- a/helm/blueapi/values.yaml
+++ b/helm/blueapi/values.yaml
@@ -228,6 +228,15 @@ initContainer:
     # -- Size of persistent volume
     size: "1Gi"
 
+# -- If enabled, runs a daily CronJob that stamps blueapi scratch PVCs with a last-used annotation when mounted by a running pod
+timeStampCron:
+  enabled: true
+
+# -- If enabled, runs a weekly CronJob that deletes blueapi scratch PVCs unused for more than 3 months.
+# To protect a PVC from deletion, set the annotation "protected" to "true" on it.
+pvcAutoDeletion:
+  enabled: true
+
 debug:
   # -- If enabled, runs debugpy, allowing port-forwarding to expose port 5678 or attached vscode instance
   enabled: false