Add runbook for ProjectStuckCreatingSLOViolation alert #1048
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow: builds the Milo container image, brings up a KinD-based test
# infrastructure cluster through Task targets, checks that the Milo API server
# answers its health endpoint, and then runs the end-to-end test suite.
# Runs on every pull request and on manual dispatch (optionally restricted to
# a single test suite via the `test_suite` input).
name: Test Environment Setup and Validation

on:
  workflow_dispatch:
    inputs:
      test_suite:
        description: 'Test suite to run (e.g., quota, group, or empty for all)'
        required: false
        default: ''
        type: string
  pull_request: {}

env:
  # Enable experimental remote taskfiles feature
  TASK_X_REMOTE_TASKFILES: '1'
  # Test infrastructure configuration
  TEST_INFRA_CLUSTER_NAME: test-infra
  MILO_IMAGE_NAME: ghcr.io/datum-cloud/milo
  MILO_IMAGE_TAG: dev

jobs:
  test-environment-validation:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version-file: 'go.mod'
          cache: true

      - name: Install Task CLI
        run: |
          sh -c "$(curl --location https://taskfile.dev/install.sh)" -- -d -b /usr/local/bin

      - name: Verify Task installation
        run: |
          task --version
          echo "Available tasks:"
          task --list

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
        with:
          buildkitd-config-inline: |
            [worker.oci]
              max-parallelism = 4

      - name: Install kubectl
        uses: azure/setup-kubectl@v4
        with:
          version: 'v1.30.0'

      - name: Install KinD
        uses: helm/kind-action@v1
        with:
          install_only: true
          version: v0.24.0

      - name: Verify prerequisites
        run: |
          echo "=== Checking prerequisites ==="
          docker version
          kubectl version --client
          kind version
          echo "Go version: $(go version)"

      - name: Build Milo container image with caching
        uses: docker/build-push-action@v5
        with:
          context: .
          push: false
          load: true  # Load the image into Docker daemon for KinD
          tags: ${{ env.MILO_IMAGE_NAME }}:${{ env.MILO_IMAGE_TAG }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
          platforms: linux/amd64
          build-args: |
            GOPROXY=https://proxy.golang.org,direct

      - name: Set up test infrastructure cluster
        run: |
          echo "Setting up test infrastructure cluster (using pre-built image)..."
          # Create the KinD cluster
          task test-infra:cluster-up
          # Generate required code
          task generate:code
          # Load the pre-built image (will skip build since image exists)
          task dev:load
          # Deploy Milo control plane (dependencies already satisfied)
          task dev:deploy

      - name: Wait for networking path to be ready
        run: |
          echo "=== Waiting for networking path to be ready ==="
          # Quick verification that KinD cluster is running
          docker ps --filter "name=test-infra"
          # Verify all Milo components are running
          echo "Checking Milo control plane components:"
          task test-infra:kubectl -- get pods -n milo-system
          # Wait for Envoy Gateway to be ready (required for API server access).
          # Best-effort: a failed wait only prints a note and does not fail the step.
          echo "⏳ Waiting for Envoy Gateway to be ready..."
          task test-infra:kubectl -- wait --for=condition=Ready pod \
            -l app.kubernetes.io/name=envoy-gateway \
            -n envoy-gateway-system --timeout=120s \
            || echo "Note: Envoy Gateway might already be ready from previous runs"
          # Check that the HTTPRoute is programmed (informational only)
          echo "Checking HTTPRoute status..."
          task test-infra:kubectl -- get httproute milo-apiserver -n milo-system -o yaml | grep -A5 "status:" || true
          # Check that the service exists and has endpoints
          echo "Checking Milo API server service and endpoints..."
          task test-infra:kubectl -- get service milo-apiserver -n milo-system
          task test-infra:kubectl -- get endpoints milo-apiserver -n milo-system
          echo "✓ Networking verification complete"

      - name: Verify Milo API server connectivity
        run: |
          echo "=== Verifying Milo API server ==="
          # Check that kubeconfig exists
          if [ ! -f ".milo/kubeconfig" ]; then
            echo "ERROR: Milo kubeconfig not found at .milo/kubeconfig"
            exit 1
          fi
          # Test API server health endpoint with brief retry
          echo "Testing Milo API server health..."
          RETRY_COUNT=0
          MAX_RETRIES=6  # 6 attempts with a 5 second pause between consecutive attempts
          while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do
            if task kubectl -- get --raw /healthz 2>/dev/null; then
              echo "✓ Milo API server is healthy"
              break
            else
              RETRY_COUNT=$((RETRY_COUNT + 1))
              # No sleep after the final failed attempt
              if [ $RETRY_COUNT -lt $MAX_RETRIES ]; then
                echo "API server not responding yet (attempt $RETRY_COUNT/$MAX_RETRIES), waiting 5 seconds..."
                sleep 5
              fi
            fi
          done
          # The counter only reaches MAX_RETRIES when every attempt failed
          # (a successful attempt breaks out before incrementing).
          if [ $RETRY_COUNT -eq $MAX_RETRIES ]; then
            echo "ERROR: Milo API server health check failed after 30 seconds"
            echo "Debugging output:"
            task kubectl -- get --raw /healthz || true
            task test-infra:kubectl -- logs -n milo-system -l app.kubernetes.io/name=milo-apiserver --tail=20
            exit 1
          fi

      - name: Run end-to-end tests
        env:
          # The dispatch input is handed over through the environment instead of
          # being interpolated into the script text, so its value is never
          # parsed as shell code. Empty on pull_request events.
          TEST_SUITE: ${{ github.event.inputs.test_suite }}
        run: |
          echo "=== Running end-to-end tests ==="
          # Determine which tests to run based on input
          if [ -n "$TEST_SUITE" ]; then
            echo "Running specified test suite: $TEST_SUITE"
            task test:end-to-end -- "$TEST_SUITE"
          else
            echo "Running all end-to-end tests..."
            task test:end-to-end
          fi

      - name: Collect debug information on failure
        if: failure()
        # Every command is best-effort (`|| true`) so one failing collector
        # does not prevent the remaining diagnostics from being gathered.
        run: |
          echo "=== Collecting debug information ==="
          # Cluster status
          echo "=== Infrastructure Cluster Status ==="
          task test-infra:kubectl -- get pods -A || true
          task test-infra:kubectl -- get nodes -o wide || true
          # Milo control plane status
          echo "=== Milo Control Plane Status ==="
          task test-infra:kubectl -- describe pods -n milo-system || true
          # Milo API server logs
          echo "=== Milo API Server Logs ==="
          task test-infra:kubectl -- logs -n milo-system -l app.kubernetes.io/name=milo-apiserver --tail=100 || true
          # Controller manager logs
          echo "=== Controller Manager Logs ==="
          task test-infra:kubectl -- logs -n milo-system -l app.kubernetes.io/name=milo-controller-manager --tail=100 || true
          # Docker container status
          echo "=== Docker Containers ==="
          docker ps -a || true
          # KinD cluster info
          echo "=== KinD cluster info ==="
          kind get clusters || true
          kind export logs /tmp/kind-logs --name "$TEST_INFRA_CLUSTER_NAME" || true

      - name: Upload debug artifacts
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: debug-logs
          path: |
            /tmp/kind-logs/
            .milo/kubeconfig
          if-no-files-found: ignore

      - name: Cleanup test infrastructure
        # Runs regardless of the outcome of earlier steps.
        if: always()
        run: |
          echo "=== Cleaning up test infrastructure ==="
          # Clean up test infrastructure cluster
          task test-infra:cluster-down || true
          # Verify cleanup
          echo "Remaining KinD clusters:"
          kind get clusters || true
          echo "Remaining Docker containers:"
          docker ps -a --filter "name=test-infra" || true