From d4fb05eb6568ba4ef8e528c3fff6d6cd5db88bdc Mon Sep 17 00:00:00 2001 From: Dimosthenis Schizas Date: Tue, 21 Apr 2026 20:53:24 +0300 Subject: [PATCH 1/2] fix: liveness probe tcpSocket default to prevent Raft catch-up restart loops Typesense returns 503 from /health when queued_writes exceeds --healthy-write-lag (default 500). This threshold is always exceeded during Raft log catch-up after a pod restart or new node join, causing liveness probes to kill pods before they finish recovering and breaking quorum in a cascading loop. Changes: - Default livenessProbe switched to tcpSocket (checks only that the process is alive and listening) - Probe type is now controlled via a .type field (httpGet or tcpSocket) rendered by a new typesense.probe helper, preventing Helm's map merge from producing both handlers simultaneously when users override the type - startupProbe and readinessProbe remain on httpGet /health (the correct signal for those probes) - Default CPU request set to 2000m per Typesense minimum requirements - Probe thresholds relaxed (failureThreshold/periodSeconds): liveness 6/20s, startup 60/10s, readiness 12/10s Closes #4 --- Chart.yaml | 14 +++++++++---- templates/_helpers.tpl | 25 ++++++++++++++++++++++ templates/statefulset.yaml | 6 +++--- tests/statefulset_test.yaml | 16 ++++++++------- values.yaml | 41 ++++++++++++++++++++++++++++--------- 5 files changed, 78 insertions(+), 24 deletions(-) diff --git a/Chart.yaml b/Chart.yaml index e5c1d32..ec0a941 100644 --- a/Chart.yaml +++ b/Chart.yaml @@ -1,9 +1,11 @@ apiVersion: v2 name: typesense description: >- - Deploy Typesense search engine on Kubernetes with Raft-based HA clustering, Prometheus metrics, and Gateway API support. This chart is not officially maintained by or affiliated with the Typesense project. + Deploy Typesense search engine on Kubernetes with Raft-based HA clustering, + Prometheus metrics, and Gateway API support. This chart is not officially + maintained by or affiliated with the Typesense project. 
type: application -version: 1.1.0 +version: 1.1.1 appVersion: "30.1" icon: https://typesense.org/typesense-logo.svg home: https://github.com/hackthebox/typesense-helm @@ -36,5 +38,9 @@ annotations: - name: metrics-exporter image: imatefx/typesense-prometheus-exporter:v0.1.5 artifacthub.io/changes: | - - kind: added - description: Initial open-source release + - kind: fixed + description: "liveness probe default changed to tcpSocket to prevent restart loops during Raft catch-up" + - kind: changed + description: "probe type is now controlled via .type field (httpGet or tcpSocket) to avoid Helm merge producing multiple handlers" + - kind: changed + description: "default CPU request set to 2000m per Typesense minimum requirements" diff --git a/templates/_helpers.tpl b/templates/_helpers.tpl index a58b390..850373f 100644 --- a/templates/_helpers.tpl +++ b/templates/_helpers.tpl @@ -64,6 +64,31 @@ Create the name of the service account to use {{- end }} {{- end }} +{{/* +Render a probe with a single handler selected by .type (httpGet or tcpSocket). +This avoids Helm's map-merge producing both handlers simultaneously when users +override just the type in their values, which Kubernetes rejects. +*/}} +{{- define "typesense.probe" -}} +{{- $p := . -}} +{{- if eq $p.type "tcpSocket" }} +tcpSocket: {{- toYaml $p.tcpSocket | nindent 2 }} +{{- else }} +httpGet: {{- toYaml $p.httpGet | nindent 2 }} +{{- end }} +failureThreshold: {{ $p.failureThreshold }} +periodSeconds: {{ $p.periodSeconds }} +{{- with $p.timeoutSeconds }} +timeoutSeconds: {{ . }} +{{- end }} +{{- with $p.successThreshold }} +successThreshold: {{ . }} +{{- end }} +{{- with $p.initialDelaySeconds }} +initialDelaySeconds: {{ . 
}} +{{- end }} +{{- end }} + {{/* Create the nodeslist */}} diff --git a/templates/statefulset.yaml b/templates/statefulset.yaml index 8ef0461..3a25dbb 100644 --- a/templates/statefulset.yaml +++ b/templates/statefulset.yaml @@ -118,9 +118,9 @@ spec: containerPort: {{ .Values.service.port }} - name: tcp-peering containerPort: 8107 - startupProbe: {{- toYaml .Values.startupProbe | nindent 12 }} - livenessProbe: {{- toYaml .Values.livenessProbe | nindent 12 }} - readinessProbe: {{- toYaml .Values.readinessProbe | nindent 12 }} + startupProbe: {{- include "typesense.probe" .Values.startupProbe | nindent 12 }} + livenessProbe: {{- include "typesense.probe" .Values.livenessProbe | nindent 12 }} + readinessProbe: {{- include "typesense.probe" .Values.readinessProbe | nindent 12 }} resources: {{- toYaml .Values.resources | nindent 12 }} volumeMounts: - name: nodeslist diff --git a/tests/statefulset_test.yaml b/tests/statefulset_test.yaml index 685cb18..855a242 100644 --- a/tests/statefulset_test.yaml +++ b/tests/statefulset_test.yaml @@ -55,19 +55,20 @@ tests: - equal: path: .spec.template.spec.containers[0].startupProbe value: - failureThreshold: 10 + failureThreshold: 60 httpGet: path: /health port: http periodSeconds: 10 + timeoutSeconds: 3 - equal: path: .spec.template.spec.containers[0].livenessProbe value: - failureThreshold: 2 - httpGet: - path: /health + failureThreshold: 6 + tcpSocket: port: http - periodSeconds: 10 + periodSeconds: 20 + timeoutSeconds: 3 - notExists: path: .spec.volumeClaimTemplates[0].spec.storageClassName - equal: @@ -287,11 +288,12 @@ tests: - equal: path: .spec.template.spec.containers[0].readinessProbe value: - failureThreshold: 3 + failureThreshold: 12 httpGet: path: /health port: http - periodSeconds: 5 + periodSeconds: 10 + timeoutSeconds: 3 - it: should render default anti-affinity but not tolerations or topologySpreadConstraints when empty asserts: - notFailedTemplate: {} diff --git a/values.yaml b/values.yaml index efc671c..d249767 
100644 --- a/values.yaml +++ b/values.yaml @@ -119,38 +119,59 @@ gateway: namespace: "istio-system" extras: [] -# -- Resource requests and limits for the Typesense container -resources: {} +# -- Resource requests and limits for the Typesense container. +# Typesense requires at least 2 vCPUs to operate correctly. No CPU limit is +# set by default to avoid throttling during indexing and Raft catch-up. +resources: + requests: + cpu: 2000m livenessProbe: - # -- HTTP GET path and port to check Typesense health for liveness + # -- Probe type: 'tcpSocket' or 'httpGet'. + # Default is tcpSocket to check only that the process is alive and listening. + # Avoid httpGet /health for liveness: Typesense returns 503 when the write + # queue exceeds --healthy-write-lag (default 500), which occurs normally + # during Raft catch-up after a restart, causing a liveness-triggered restart + # loop that prevents the cluster from ever recovering. + type: tcpSocket httpGet: path: /health port: http + tcpSocket: + port: http # -- Number of failed liveness checks before restarting the container - failureThreshold: 2 + failureThreshold: 6 # -- Period (in seconds) to perform the liveness check - periodSeconds: 10 + periodSeconds: 20 + timeoutSeconds: 3 startupProbe: - # -- HTTP GET path and port to check Typesense health for startup + # -- Probe type: 'httpGet' or 'tcpSocket' + type: httpGet httpGet: path: /health port: http + tcpSocket: + port: http # -- Number of failed startup checks before marking the container as unhealthy - failureThreshold: 10 + failureThreshold: 60 # -- Period (in seconds) to perform the startup check periodSeconds: 10 + timeoutSeconds: 3 readinessProbe: - # -- HTTP GET path and port to check Typesense readiness + # -- Probe type: 'httpGet' or 'tcpSocket' + type: httpGet httpGet: path: /health port: http + tcpSocket: + port: http # -- Period (in seconds) to perform the readiness check - periodSeconds: 5 + periodSeconds: 10 # -- Number of failed readiness checks before 
marking the pod as unready - failureThreshold: 3 + failureThreshold: 12 + timeoutSeconds: 3 # -- Node selector to schedule pods on specific nodes (optional) nodeSelector: {} From 6d3382d0d0ff02ef04e1903dc56f991c3090bc8f Mon Sep 17 00:00:00 2001 From: Dimosthenis Schizas Date: Tue, 21 Apr 2026 20:55:18 +0300 Subject: [PATCH 2/2] =?UTF-8?q?=F0=9F=93=9D=20update=20README=20and=20valu?= =?UTF-8?q?es?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 32 ++++++++++++++++++++++---------- values.md | 32 ++++++++++++++++++++++---------- 2 files changed, 44 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 4e061ba..b500df8 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # typesense -![Version: 1.1.0](https://img.shields.io/badge/Version-1.1.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 30.1](https://img.shields.io/badge/AppVersion-30.1-informational?style=flat-square) +![Version: 1.1.1](https://img.shields.io/badge/Version-1.1.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 30.1](https://img.shields.io/badge/AppVersion-30.1-informational?style=flat-square) [![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/typesense)](https://artifacthub.io/packages/search?repo=typesense) @@ -233,9 +233,13 @@ storage: | ingress.enabled | bool | `false` | Enable or disable Ingress for the application | | ingress.hosts | list | `[]` | List of hostnames the Ingress will route traffic for | | ingress.prefix | string | `"/"` | The URL path prefix for the application | -| livenessProbe.failureThreshold | int | `2` | Number of failed liveness checks before restarting the container | -| livenessProbe.httpGet | object | `{"path":"/health","port":"http"}` | HTTP GET path and 
port to check Typesense health for liveness | -| livenessProbe.periodSeconds | int | `10` | Period (in seconds) to perform the liveness check | +| livenessProbe.failureThreshold | int | `6` | Number of failed liveness checks before restarting the container | +| livenessProbe.httpGet.path | string | `"/health"` | | +| livenessProbe.httpGet.port | string | `"http"` | | +| livenessProbe.periodSeconds | int | `20` | Period (in seconds) to perform the liveness check | +| livenessProbe.tcpSocket.port | string | `"http"` | | +| livenessProbe.timeoutSeconds | int | `3` | | +| livenessProbe.type | string | `"tcpSocket"` | Probe type: 'tcpSocket' or 'httpGet'. Default is tcpSocket to check only that the process is alive and listening. Avoid httpGet /health for liveness: Typesense returns 503 when the write queue exceeds --healthy-write-lag (default 500), which occurs normally during Raft catch-up after a restart, causing a liveness-triggered restart loop that prevents the cluster from ever recovering. 
| | metrics.enabled | bool | `false` | Enable Prometheus metrics sidecar | | metrics.image.pullPolicy | string | `"IfNotPresent"` | Image pull policy for metrics exporter | | metrics.image.repository | string | `"imatefx/typesense-prometheus-exporter"` | Metrics exporter image repository | @@ -255,11 +259,15 @@ storage: | podSecurityContext.runAsGroup | int | `3000` | Group ID for running the Typesense process | | podSecurityContext.runAsNonRoot | bool | `true` | Ensure the container does not run as root | | podSecurityContext.runAsUser | int | `10000` | User ID for running the Typesense process | -| readinessProbe.failureThreshold | int | `3` | Number of failed readiness checks before marking the pod as unready | -| readinessProbe.httpGet | object | `{"path":"/health","port":"http"}` | HTTP GET path and port to check Typesense readiness | -| readinessProbe.periodSeconds | int | `5` | Period (in seconds) to perform the readiness check | +| readinessProbe.failureThreshold | int | `12` | Number of failed readiness checks before marking the pod as unready | +| readinessProbe.httpGet.path | string | `"/health"` | | +| readinessProbe.httpGet.port | string | `"http"` | | +| readinessProbe.periodSeconds | int | `10` | Period (in seconds) to perform the readiness check | +| readinessProbe.tcpSocket.port | string | `"http"` | | +| readinessProbe.timeoutSeconds | int | `3` | | +| readinessProbe.type | string | `"httpGet"` | Probe type: 'httpGet' or 'tcpSocket' | | replicaCount | int | `3` | Number of replicas for the Typesense deployment | -| resources | object | `{}` | Resource requests and limits for the Typesense container | +| resources | object | `{"requests":{"cpu":"2000m"}}` | Resource requests and limits for the Typesense container. Typesense requires at least 2 vCPUs to operate correctly. No CPU limit is set by default to avoid throttling during indexing and Raft catch-up. 
| | secrets.externalSecret.enabled | bool | `false` | Enable or disable ExternalSecret creation (requires external-secrets operator) | | secrets.externalSecret.extractKey | string | `""` | The key path to extract secrets from | | secrets.externalSecret.storeName | string | `""` | The name of the ClusterSecretStore or SecretStore to use | @@ -274,9 +282,13 @@ storage: | serviceAccount.automountServiceAccountToken | bool | `false` | Whether to automount the ServiceAccount token | | serviceAccount.create | bool | `true` | Whether to create a ServiceAccount | | serviceAccount.name | string | `""` | Name of the ServiceAccount. Defaults to fullname | -| startupProbe.failureThreshold | int | `10` | Number of failed startup checks before marking the container as unhealthy | -| startupProbe.httpGet | object | `{"path":"/health","port":"http"}` | HTTP GET path and port to check Typesense health for startup | +| startupProbe.failureThreshold | int | `60` | Number of failed startup checks before marking the container as unhealthy | +| startupProbe.httpGet.path | string | `"/health"` | | +| startupProbe.httpGet.port | string | `"http"` | | | startupProbe.periodSeconds | int | `10` | Period (in seconds) to perform the startup check | +| startupProbe.tcpSocket.port | string | `"http"` | | +| startupProbe.timeoutSeconds | int | `3` | | +| startupProbe.type | string | `"httpGet"` | Probe type: 'httpGet' or 'tcpSocket' | | storage.className | string | `nil` | Storage class to use for Persistent Volume Claims (PVC) | | storage.size | string | `"10Gi"` | Size of the persistent storage volume (e.g., 10Gi) | | terminationGracePeriodSeconds | int | `300` | Termination grace period in seconds. Typesense recommends 300s to allow graceful shutdown. 
| diff --git a/values.md b/values.md index 4e061ba..b500df8 100644 --- a/values.md +++ b/values.md @@ -1,6 +1,6 @@ # typesense -![Version: 1.1.0](https://img.shields.io/badge/Version-1.1.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 30.1](https://img.shields.io/badge/AppVersion-30.1-informational?style=flat-square) +![Version: 1.1.1](https://img.shields.io/badge/Version-1.1.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 30.1](https://img.shields.io/badge/AppVersion-30.1-informational?style=flat-square) [![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/typesense)](https://artifacthub.io/packages/search?repo=typesense) @@ -233,9 +233,13 @@ storage: | ingress.enabled | bool | `false` | Enable or disable Ingress for the application | | ingress.hosts | list | `[]` | List of hostnames the Ingress will route traffic for | | ingress.prefix | string | `"/"` | The URL path prefix for the application | -| livenessProbe.failureThreshold | int | `2` | Number of failed liveness checks before restarting the container | -| livenessProbe.httpGet | object | `{"path":"/health","port":"http"}` | HTTP GET path and port to check Typesense health for liveness | -| livenessProbe.periodSeconds | int | `10` | Period (in seconds) to perform the liveness check | +| livenessProbe.failureThreshold | int | `6` | Number of failed liveness checks before restarting the container | +| livenessProbe.httpGet.path | string | `"/health"` | | +| livenessProbe.httpGet.port | string | `"http"` | | +| livenessProbe.periodSeconds | int | `20` | Period (in seconds) to perform the liveness check | +| livenessProbe.tcpSocket.port | string | `"http"` | | +| livenessProbe.timeoutSeconds | int | `3` | | +| livenessProbe.type | string | `"tcpSocket"` | Probe type: 
'tcpSocket' or 'httpGet'. Default is tcpSocket to check only that the process is alive and listening. Avoid httpGet /health for liveness: Typesense returns 503 when the write queue exceeds --healthy-write-lag (default 500), which occurs normally during Raft catch-up after a restart, causing a liveness-triggered restart loop that prevents the cluster from ever recovering. | | metrics.enabled | bool | `false` | Enable Prometheus metrics sidecar | | metrics.image.pullPolicy | string | `"IfNotPresent"` | Image pull policy for metrics exporter | | metrics.image.repository | string | `"imatefx/typesense-prometheus-exporter"` | Metrics exporter image repository | @@ -255,11 +259,15 @@ storage: | podSecurityContext.runAsGroup | int | `3000` | Group ID for running the Typesense process | | podSecurityContext.runAsNonRoot | bool | `true` | Ensure the container does not run as root | | podSecurityContext.runAsUser | int | `10000` | User ID for running the Typesense process | -| readinessProbe.failureThreshold | int | `3` | Number of failed readiness checks before marking the pod as unready | -| readinessProbe.httpGet | object | `{"path":"/health","port":"http"}` | HTTP GET path and port to check Typesense readiness | -| readinessProbe.periodSeconds | int | `5` | Period (in seconds) to perform the readiness check | +| readinessProbe.failureThreshold | int | `12` | Number of failed readiness checks before marking the pod as unready | +| readinessProbe.httpGet.path | string | `"/health"` | | +| readinessProbe.httpGet.port | string | `"http"` | | +| readinessProbe.periodSeconds | int | `10` | Period (in seconds) to perform the readiness check | +| readinessProbe.tcpSocket.port | string | `"http"` | | +| readinessProbe.timeoutSeconds | int | `3` | | +| readinessProbe.type | string | `"httpGet"` | Probe type: 'httpGet' or 'tcpSocket' | | replicaCount | int | `3` | Number of replicas for the Typesense deployment | -| resources | object | `{}` | Resource requests and limits for the 
Typesense container | +| resources | object | `{"requests":{"cpu":"2000m"}}` | Resource requests and limits for the Typesense container. Typesense requires at least 2 vCPUs to operate correctly. No CPU limit is set by default to avoid throttling during indexing and Raft catch-up. | | secrets.externalSecret.enabled | bool | `false` | Enable or disable ExternalSecret creation (requires external-secrets operator) | | secrets.externalSecret.extractKey | string | `""` | The key path to extract secrets from | | secrets.externalSecret.storeName | string | `""` | The name of the ClusterSecretStore or SecretStore to use | @@ -274,9 +282,13 @@ storage: | serviceAccount.automountServiceAccountToken | bool | `false` | Whether to automount the ServiceAccount token | | serviceAccount.create | bool | `true` | Whether to create a ServiceAccount | | serviceAccount.name | string | `""` | Name of the ServiceAccount. Defaults to fullname | -| startupProbe.failureThreshold | int | `10` | Number of failed startup checks before marking the container as unhealthy | -| startupProbe.httpGet | object | `{"path":"/health","port":"http"}` | HTTP GET path and port to check Typesense health for startup | +| startupProbe.failureThreshold | int | `60` | Number of failed startup checks before marking the container as unhealthy | +| startupProbe.httpGet.path | string | `"/health"` | | +| startupProbe.httpGet.port | string | `"http"` | | | startupProbe.periodSeconds | int | `10` | Period (in seconds) to perform the startup check | +| startupProbe.tcpSocket.port | string | `"http"` | | +| startupProbe.timeoutSeconds | int | `3` | | +| startupProbe.type | string | `"httpGet"` | Probe type: 'httpGet' or 'tcpSocket' | | storage.className | string | `nil` | Storage class to use for Persistent Volume Claims (PVC) | | storage.size | string | `"10Gi"` | Size of the persistent storage volume (e.g., 10Gi) | | terminationGracePeriodSeconds | int | `300` | Termination grace period in seconds. 
Typesense recommends 300s to allow graceful shutdown. |