NVIDIA · derekwaynecarr · Jun 1, 2026 · May 27, 2026
@@ -138,6 +138,15 @@ kubectl -n openshell rollout status statefulset/openshell
 
 Look for failed installs, unexpected values, missing namespace, wrong image tag, TLS settings that do not match the registered endpoint, and scheduling failures.
 
+For HA or PostgreSQL-backed installs, also check the service-binding Secret and
+bundled PostgreSQL workload:
+
+```bash
+kubectl -n openshell get secret -l app.kubernetes.io/instance=openshell
+kubectl -n openshell get statefulset,pod,pvc -l app.kubernetes.io/instance=openshell
+kubectl -n openshell logs statefulset/openshell-postgres --tail=200
+```
+
 Check required Helm deployment secrets:
 
 ```bash

@@ -1,6 +1,6 @@
 ---
 name: helm-dev-environment
-description: Start up, tear down, and configure the local Kubernetes development environment for OpenShell. Uses k3d (Docker-backed k3s) + Skaffold + Helm. Covers cluster lifecycle, optional add-ons (Keycloak OIDC, Envoy Gateway), and port mappings. Trigger keywords - local k8s, local cluster, k3d, skaffold, helm dev, start cluster, stop cluster, tear down cluster, delete cluster, create cluster, helm:k3s, helm:skaffold, local dev environment, dev cluster, k8s dev, envoy gateway local, keycloak local.
+description: Start up, tear down, and configure the local Kubernetes development environment for OpenShell. Uses k3d (Docker-backed k3s) + Skaffold + Helm. Covers cluster lifecycle, optional add-ons (Keycloak OIDC, Envoy Gateway), HA testing, and port mappings. Trigger keywords - local k8s, local cluster, k3d, skaffold, helm dev, start cluster, stop cluster, tear down cluster, delete cluster, create cluster, helm:k3s, helm:skaffold, local dev environment, dev cluster, k8s dev, envoy gateway local, keycloak local, high availability, HA.
 ---
 
 # Helm Dev Environment
@@ -65,6 +65,10 @@ generates mTLS secrets on first install. Envoy Gateway opt-in; see the Optional
 
 The gateway Service uses ClusterIP. Access is via Envoy Gateway (port `8080`) or `kubectl port-forward`.
 
+**HA test deploy** (two gateway replicas + bundled PostgreSQL): uncomment
+`#- ci/values-high-availability.yaml` in `deploy/helm/openshell/skaffold.yaml`,
+then run `mise run helm:skaffold:run` or `mise run helm:skaffold:dev`.
+
 ### TLS behaviour
 
 `ci/values-skaffold.yaml` sets `server.disableTls: true`, so Skaffold-based deploys run
@@ -198,6 +202,7 @@ mise run helm:k3s:status
 | `deploy/helm/openshell/ci/values-skaffold.yaml` | Dev overrides (image pull policy, TLS disabled for local Skaffold) |
 | `deploy/helm/openshell/ci/values-cert-manager.yaml` | cert-manager PKI overlay (opt-in; disables pkiInitJob) |
 | `deploy/helm/openshell/ci/values-gateway.yaml` | Envoy Gateway GRPCRoute + Gateway overlay |
+| `deploy/helm/openshell/ci/values-high-availability.yaml` | HA test overlay (`replicaCount: 2` with bundled PostgreSQL) |
 | `deploy/helm/openshell/ci/values-keycloak.yaml` | Keycloak OIDC overlay |
 | `deploy/helm/openshell/ci/values-tls-disabled.yaml` | Lint-only: TLS + auth disabled (reverse-proxy edge termination) |
 | `deploy/kube/manifests/envoy-gateway-openshell.yaml` | GatewayClass for Envoy Gateway (`mise run helm:gateway:apply`) |

@@ -23,6 +23,7 @@ jobs:
       should_run: ${{ steps.gate.outputs.should_run }}
       run_core_e2e: ${{ steps.labels.outputs.run_core_e2e }}
       run_gpu_e2e: ${{ steps.labels.outputs.run_gpu_e2e }}
+      run_kubernetes_ha_e2e: ${{ steps.labels.outputs.run_kubernetes_ha_e2e }}
       run_any_e2e: ${{ steps.labels.outputs.run_any_e2e }}
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
@@ -39,24 +40,27 @@ jobs:
           if [ "$EVENT_NAME" != "push" ]; then
             run_core_e2e=true
             run_gpu_e2e=true
+            run_kubernetes_ha_e2e=true
           else
             run_core_e2e="$(jq -r 'index("test:e2e") != null' <<< "$LABELS_JSON")"
             run_gpu_e2e="$(jq -r 'index("test:e2e-gpu") != null' <<< "$LABELS_JSON")"
+            run_kubernetes_ha_e2e="$(jq -r 'index("test:e2e-kubernetes") != null' <<< "$LABELS_JSON")"
           fi
-          if [ "$run_core_e2e" = "true" ] || [ "$run_gpu_e2e" = "true" ]; then
+          if [ "$run_core_e2e" = "true" ] || [ "$run_gpu_e2e" = "true" ] || [ "$run_kubernetes_ha_e2e" = "true" ]; then
             run_any_e2e=true
           else
             run_any_e2e=false
           fi
           {
             echo "run_core_e2e=$run_core_e2e"
             echo "run_gpu_e2e=$run_gpu_e2e"
+            echo "run_kubernetes_ha_e2e=$run_kubernetes_ha_e2e"
             echo "run_any_e2e=$run_any_e2e"
           } >> "$GITHUB_OUTPUT"
 
   build-gateway:
     needs: [pr_metadata]
-    if: needs.pr_metadata.outputs.should_run == 'true' && needs.pr_metadata.outputs.run_core_e2e == 'true'
+    if: needs.pr_metadata.outputs.should_run == 'true' && (needs.pr_metadata.outputs.run_core_e2e == 'true' || needs.pr_metadata.outputs.run_kubernetes_ha_e2e == 'true')
     permissions:
       contents: read
       packages: write
@@ -107,6 +111,16 @@ jobs:
     with:
       image-tag: ${{ github.sha }}
 
+  kubernetes-ha-e2e:
+    needs: [pr_metadata, build-gateway, build-supervisor]  # both image build jobs must finish first
+    if: needs.pr_metadata.outputs.should_run == 'true' && needs.pr_metadata.outputs.run_kubernetes_ha_e2e == 'true'  # opt-in: label-driven on push events, always on for other events
+    permissions:
+      contents: read
+      packages: read
+    uses: ./.github/workflows/e2e-kubernetes-ha-test.yml  # wrapper that runs the Kubernetes E2E workflow with the HA values overlay
+    with:
+      image-tag: ${{ github.sha }}
+
   core-e2e-result:
     name: Core E2E result
     needs: [pr_metadata, build-gateway, build-supervisor, e2e, kubernetes-e2e]
@@ -160,3 +174,30 @@ jobs:
             fi
           done
           exit "$failed"
+
+  kubernetes-ha-e2e-result:
+    name: Kubernetes HA E2E result
+    needs: [pr_metadata, build-gateway, build-supervisor, kubernetes-ha-e2e]  # every job whose conclusion is checked below
+    if: always() && needs.pr_metadata.outputs.should_run == 'true' && needs.pr_metadata.outputs.run_kubernetes_ha_e2e == 'true'  # always(): still evaluate when a needed job failed or was skipped
+    runs-on: ubuntu-latest
+    steps:
+      - name: Verify Kubernetes HA E2E jobs
+        env:
+          BUILD_GATEWAY_RESULT: ${{ needs.build-gateway.result }}
+          BUILD_SUPERVISOR_RESULT: ${{ needs.build-supervisor.result }}
+          KUBERNETES_HA_E2E_RESULT: ${{ needs.kubernetes-ha-e2e.result }}
+        run: |
+          set -euo pipefail
+          failed=0
+          for item in \
+            "build-gateway:$BUILD_GATEWAY_RESULT" \
+            "build-supervisor:$BUILD_SUPERVISOR_RESULT" \
+            "kubernetes-ha-e2e:$KUBERNETES_HA_E2E_RESULT"; do
+            name="${item%%:*}"
+            result="${item#*:}"
+            if [ "$result" != "success" ]; then
+              echo "::error::$name concluded $result"
+              failed=1
+            fi
+          done
+          exit "$failed"
@@ -0,0 +1,37 @@
+name: Kubernetes HA E2E Test
+
+on:
+  workflow_call:  # reusable workflow: invoked from another workflow via `uses:`
+    inputs:
+      image-tag:
+        description: "Image tag to test (typically the commit SHA)"
+        required: true
+        type: string
+      runner:
+        description: "GitHub Actions runner label"
+        required: false
+        type: string
+        default: "linux-amd64-cpu8"
+      checkout-ref:
+        description: "Git ref to check out for test inputs (defaults to the workflow SHA)"
+        required: false
+        type: string
+        default: ""  # passed through unchanged to the wrapped workflow
+
+permissions:
+  contents: read
+  packages: read
+
+jobs:
+  e2e-kubernetes-ha:
+    name: Kubernetes HA E2E
+    permissions:
+      contents: read
+      packages: read
+    uses: ./.github/workflows/e2e-kubernetes-test.yml  # thin wrapper: delegates to the regular Kubernetes E2E workflow
+    secrets: inherit  # forward the secrets available to this workflow to the wrapped one
+    with:
+      image-tag: ${{ inputs.image-tag }}
+      runner: ${{ inputs.runner }}
+      checkout-ref: ${{ inputs.checkout-ref }}
+      extra-helm-values: deploy/helm/openshell/ci/values-high-availability.yaml  # HA overlay; the only input this wrapper adds
@@ -17,6 +17,11 @@ on:
         required: false
         type: string
         default: ""
+      extra-helm-values:
+        description: "Colon-separated Helm values files to layer on the Kubernetes e2e chart install"
+        required: false
+        type: string
+        default: ""
 
 permissions:
   contents: read
@@ -93,6 +98,7 @@ jobs:
       - name: Run Kubernetes E2E (Rust smoke)
         env:
           OPENSHELL_E2E_KUBE_CONTEXT: kind-${{ env.KIND_CLUSTER_NAME }}
+          OPENSHELL_E2E_KUBE_EXTRA_VALUES: ${{ inputs.extra-helm-values }}
           IMAGE_TAG: ${{ inputs.image-tag }}
           OPENSHELL_REGISTRY: ghcr.io/nvidia/openshell
         run: mise run --no-deps --skip-deps e2e:kubernetes
@@ -19,7 +19,7 @@ permissions: {}
 jobs:
   hint:
     name: Post next-step hint for E2E label
-    if: github.event.label.name == 'test:e2e' || github.event.label.name == 'test:e2e-gpu'
+    if: github.event.label.name == 'test:e2e' || github.event.label.name == 'test:e2e-gpu' || github.event.label.name == 'test:e2e-kubernetes'
     runs-on: ubuntu-latest
     permissions:
       pull-requests: write
@@ -43,10 +43,17 @@ jobs:
             test:e2e)
               suite_summary="the standard E2E suite"
               build_summary="gateway and supervisor images"
+              status_summary="The matching required CI gate status on this PR will flip green automatically once the run finishes."
               ;;
             test:e2e-gpu)
               suite_summary="GPU E2E"
               build_summary="supervisor image"
+              status_summary="The matching required CI gate status on this PR will flip green automatically once the run finishes."
+              ;;
+            test:e2e-kubernetes)
+              suite_summary="Kubernetes HA E2E"
+              build_summary="gateway and supervisor images"
+              status_summary="This is an optional proof-of-life suite; failures are visible in the workflow run but do not publish a required CI gate status."
               ;;
             *) echo "Unrecognized label $LABEL_NAME"; exit 1 ;;
           esac
@@ -69,7 +76,7 @@ jobs:
               workflow_link="[$workflow_name](https://github.com/$GH_REPO/actions/workflows/$workflow_file)"
               instructions="Open $workflow_link, find the run for commit \`$short_pr\`, and click **Re-run all jobs** to execute with the label set."
             fi
-            body="Label \`$LABEL_NAME\` applied for \`$short_pr\`. $instructions The run will execute $suite_summary after building the required $build_summary once. The matching required CI gate status on this PR will flip green automatically once the run finishes."
+            body="Label \`$LABEL_NAME\` applied for \`$short_pr\`. $instructions The run will execute $suite_summary after building the required $build_summary once. $status_summary"
           fi
 
           gh pr comment "$PR_NUMBER" --body "$body"
@@ -10,13 +10,15 @@ PR CI that runs on NVIDIA self-hosted runners uses NVIDIA's copy-pr-bot. The bot
 
 `Branch Checks` run automatically after copy-pr-bot mirrors the PR. `Required CI Gates` posts PR-head statuses that verify the mirror exists, is current, and ran the expected push-based workflows. E2E suites are opt-in because they are more expensive and publish temporary images.
 
-Two opt-in labels enable the long-running E2E suites:
+Three opt-in labels enable the long-running E2E suites:
 
 - `test:e2e` runs the standard E2E suite in `Branch E2E Checks`
 - `test:e2e-gpu` runs GPU E2E in `Branch E2E Checks`
+- `test:e2e-kubernetes` runs Kubernetes E2E with the HA Helm overlay
+  (`replicaCount: 2` and bundled PostgreSQL) in `Branch E2E Checks`
 
-When both labels are present, `Branch E2E Checks` builds the shared gateway and supervisor images once and fans out all enabled suites in parallel.
-The `OpenShell / E2E` and `OpenShell / GPU E2E` required statuses are evaluated from separate suite result jobs inside that workflow, so the expensive GPU suite stays independently gated.
+When multiple labels are present, `Branch E2E Checks` builds the shared gateway and supervisor images once and fans out all enabled suites in parallel.
+The `OpenShell / E2E` and `OpenShell / GPU E2E` required statuses are evaluated from separate suite result jobs inside that workflow. `test:e2e-kubernetes` is optional while HA behavior is under active iteration: failures are visible in the workflow run but do not publish a required CI gate status.
 
 The GitHub ruleset should require the `OpenShell / ...` statuses published by `Required CI Gates`, not the push-triggered workflow jobs directly.
 
@@ -69,7 +71,7 @@ Flow:
 
 1. Open the PR. copy-pr-bot mirrors it to `pull-request/<N>` automatically.
 2. The mirror push runs `Branch Checks` automatically. `Required CI Gates` keeps the PR blocked until the mirror exists, matches the PR head SHA, and the required push-based workflow succeeds. The first `Branch E2E Checks` run only resolves metadata and skips expensive jobs unless an E2E label is already set.
-3. A maintainer applies `test:e2e` and/or `test:e2e-gpu`. `E2E Label Help` posts a comment with a link to the existing gated workflow run.
+3. A maintainer applies `test:e2e`, `test:e2e-gpu`, and/or `test:e2e-kubernetes`. `E2E Label Help` posts a comment with a link to the existing gated workflow run.
 4. The maintainer opens that link and clicks **Re-run all jobs**. This time `pr_metadata` sees the label and the build/E2E jobs run.
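-5. When the run finishes, the matching `OpenShell / ...` gate status flips to green automatically.
+5. When the run finishes, the matching `OpenShell / ...` gate status flips to green automatically. `test:e2e-kubernetes` publishes no gate status; check its result in the workflow run instead.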
 6. New commits push to the mirror automatically and re-trigger `Branch Checks` plus any labeled E2E jobs in `Branch E2E Checks`.
@@ -108,7 +110,7 @@ The bot's full administrator documentation is internal to NVIDIA. The only comma
 | File | Role |
 |---|---|
 | `.github/workflows/branch-checks.yml` | Required non-E2E PR checks. Triggers on `push: pull-request/[0-9]+`. |
-| `.github/workflows/branch-e2e.yml` | Opt-in standard and GPU E2E. Triggers on `push: pull-request/[0-9]+` and runs jobs selected by `test:e2e` / `test:e2e-gpu`. |
+| `.github/workflows/branch-e2e.yml` | Opt-in standard, GPU, and Kubernetes HA E2E. Triggers on `push: pull-request/[0-9]+` and runs jobs selected by `test:e2e`, `test:e2e-gpu`, or `test:e2e-kubernetes`. |
 | `.github/workflows/helm-lint.yml` | Helm chart validation. Triggers on `push: pull-request/[0-9]+` and skips lint jobs unless Helm inputs changed. |
 | `.github/actions/pr-gate/action.yml` | Composite action that resolves PR metadata and verifies the required label is set. |
 | `.github/actions/pr-merge-base/action.yml` | Composite action that resolves and fetches the merge-base commit for `pull-request/<N>` push workflows. |

@@ -302,4 +302,4 @@ DCO sign-off is separate from cryptographic commit signing. CI requires signing
 
 ## CI
 
-How PR CI runs, the `test:e2e` / `test:e2e-gpu` labels, copy-pr-bot, and commit-signing setup are documented in [CI.md](CI.md).
+How PR CI runs, the `test:e2e`, `test:e2e-gpu`, and `test:e2e-kubernetes` labels, copy-pr-bot, and commit-signing setup are documented in [CI.md](CI.md).
@@ -56,6 +56,7 @@ See [`values.yaml`](values.yaml) for source defaults. Selected overlays:
 - [`ci/values-gateway.yaml`](ci/values-gateway.yaml) - gateway-only configuration
 - [`ci/values-cert-manager.yaml`](ci/values-cert-manager.yaml) - cert-manager integration
 - [`ci/values-keycloak.yaml`](ci/values-keycloak.yaml) - Keycloak OIDC integration
+- [`ci/values-high-availability.yaml`](ci/values-high-availability.yaml) - HA gateway test overlay with bundled PostgreSQL
 
 ### Database backend
 

@@ -56,6 +56,7 @@ See [`values.yaml`](values.yaml) for source defaults. Selected overlays:
 - [`ci/values-gateway.yaml`](ci/values-gateway.yaml) - gateway-only configuration
 - [`ci/values-cert-manager.yaml`](ci/values-cert-manager.yaml) - cert-manager integration
 - [`ci/values-keycloak.yaml`](ci/values-keycloak.yaml) - Keycloak OIDC integration
+- [`ci/values-high-availability.yaml`](ci/values-high-availability.yaml) - HA gateway test overlay with bundled PostgreSQL
 
 ### Database backend
 

@@ -0,0 +1,12 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# CI/dev overlay for exercising the gateway with more than one replica.
+# SQLite is not suitable for HA because each replica has its own pod volume, so
+# this overlay enables the bundled PostgreSQL dependency added by the chart.
+replicaCount: 2  # run two gateway replicas
+
+postgres:
+  enabled: true  # turn on the bundled PostgreSQL dependency
+  auth:
+    password: openshell-ha-ci  # NOTE(review): plaintext credential in VCS; tolerable only because this overlay is CI/dev-scoped
@@ -95,6 +95,8 @@ deploy:
           #- ci/values-keycloak.yaml
           # To enable the Gateway API HTTPRoute (requires Envoy Gateway above):
           #- ci/values-gateway.yaml
+          # To test HA gateway behavior with bundled PostgreSQL:
+          #- ci/values-high-availability.yaml
         setValueTemplates:
           image.repository: '{{.IMAGE_REPO_openshell_gateway}}'
           image.tag: '{{.IMAGE_TAG_openshell_gateway}}'

@@ -16,6 +16,10 @@
 # Helm e2e currently uses plaintext gateway traffic (ci/values-skaffold.yaml).
 # The certgen hook still runs so the gateway has sandbox JWT signing keys.
 #
+# Set OPENSHELL_E2E_KUBE_EXTRA_VALUES to colon-separated Helm values files
+# (repo-root-relative or absolute) layered on top of ci/values-skaffold.yaml.
+# In single-install mode this also installs the chart with its dependencies.
+#
 # Image source:
 #   - Ephemeral k3d mode builds local `openshell/{gateway,supervisor}:${IMAGE_TAG}`
 #     images by default, imports them into k3d, then installs the chart. This
@@ -241,7 +245,7 @@ run_scenario() {
 
   helmctl install "${RELEASE_NAME}" "${ROOT}/deploy/helm/openshell" \
     --namespace "${NAMESPACE}" --create-namespace \
-    --values "${ROOT}/deploy/helm/openshell/ci/values-skaffold.yaml" \
+    "${helm_values_args[@]}" \
     --set "fullnameOverride=openshell" \
     --set "image.repository=${REGISTRY_VALUE}/gateway" \
     --set "image.tag=${IMAGE_TAG_VALUE}" \
@@ -535,6 +539,20 @@ if [ -n "${HOST_GATEWAY_IP}" ]; then
   helm_extra_args+=(--set "server.hostGatewayIP=${HOST_GATEWAY_IP}")
 fi
 
+helm_values_args=(--values "${ROOT}/deploy/helm/openshell/ci/values-skaffold.yaml")  # base overlay always comes first
+helm_extra_values_enabled=0  # becomes 1 once at least one extra values file is queued
+if [ -n "${OPENSHELL_E2E_KUBE_EXTRA_VALUES:-}" ]; then
+  IFS=':' read -r -a extra_values_files <<< "${OPENSHELL_E2E_KUBE_EXTRA_VALUES}"  # split the colon-separated list into an array
+  for values_file in "${extra_values_files[@]}"; do
+    [ -n "${values_file}" ] || continue  # skip empty entries such as "a::b" or a trailing colon
+    if [[ "${values_file}" != /* ]]; then  # relative paths are resolved against the repository root
+      values_file="${ROOT}/${values_file}"
+    fi
+    helm_values_args+=(--values "${values_file}")  # appended after the base overlay, in the order given
+    helm_extra_values_enabled=1
+  done
+fi
+
 if [ "${OPENSHELL_E2E_KUBE_DB_SCENARIOS:-0}" = "1" ]; then
   helm dependency build "${ROOT}/deploy/helm/openshell"
 
@@ -573,11 +591,18 @@ if [ "${OPENSHELL_E2E_KUBE_DB_SCENARIOS:-0}" = "1" ]; then
   fi
 else
   # --- Single-install mode (default, existing behavior) ---
-  chart_dir="$(chart_without_dependencies)"
+  helm_dependency_args=()
+  if [ "${helm_extra_values_enabled}" = "1" ]; then
+    chart_dir="${ROOT}/deploy/helm/openshell"
+    helm_dependency_args=(--dependency-update)
+  else
+    chart_dir="$(chart_without_dependencies)"
+  fi
   echo "Installing Helm chart (release=${RELEASE_NAME}, namespace=${NAMESPACE}, tag=${IMAGE_TAG_VALUE})..."
   helmctl install "${RELEASE_NAME}" "${chart_dir}" \
     --namespace "${NAMESPACE}" --create-namespace \
-    --values "${ROOT}/deploy/helm/openshell/ci/values-skaffold.yaml" \
+    "${helm_dependency_args[@]}" \
+    "${helm_values_args[@]}" \
     --set "fullnameOverride=openshell" \
     --set "image.repository=${REGISTRY_VALUE}/gateway" \
     --set "image.tag=${IMAGE_TAG_VALUE}" \
Original file line number	Diff line number	Diff line change
Expand Up		@@ -302,4 +302,4 @@ DCO sign-off is separate from cryptographic commit signing. CI requires signing

		## CI

		How PR CI runs, the `test:e2e` / `test:e2e-gpu` labels, copy-pr-bot, and commit-signing setup are documented in [CI.md](CI.md).
		How PR CI runs, the `test:e2e`, `test:e2e-gpu`, and `test:e2e-kubernetes` labels, copy-pr-bot, and commit-signing setup are documented in [CI.md](CI.md).