Skip to content

Add runbook for ProjectStuckCreatingSLOViolation alert #1048

Add runbook for ProjectStuckCreatingSLOViolation alert

Add runbook for ProjectStuckCreatingSLOViolation alert #1048

# Test Environment Setup and Validation
#
# Builds the Milo container image, creates the test infrastructure cluster
# through Task, deploys the Milo control plane, verifies the API server
# answers on /healthz, and then runs the end-to-end tests.
#
# Triggers:
#   - pull_request: runs every end-to-end test.
#   - workflow_dispatch: optional `test_suite` input selects a single suite;
#     an empty value runs everything.
name: Test Environment Setup and Validation

on:
  workflow_dispatch:
    inputs:
      test_suite:
        description: 'Test suite to run (e.g., quota, group, or empty for all)'
        required: false
        default: ''
        type: string
  pull_request: {}

env:
  # Enable experimental remote taskfiles feature
  TASK_X_REMOTE_TASKFILES: '1'
  # Test infrastructure configuration
  TEST_INFRA_CLUSTER_NAME: test-infra
  MILO_IMAGE_NAME: ghcr.io/datum-cloud/milo
  MILO_IMAGE_TAG: dev

jobs:
  test-environment-validation:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version-file: 'go.mod'
          cache: true

      - name: Install Task CLI
        # --fail makes curl emit nothing on an HTTP error, so an error page is
        # never executed as the install script. The next step then fails
        # because `task` is missing.
        run: |
          sh -c "$(curl --fail --show-error --location https://taskfile.dev/install.sh)" -- -d -b /usr/local/bin

      - name: Verify Task installation
        run: |
          task --version
          echo "Available tasks:"
          task --list

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
        with:
          buildkitd-config-inline: |
            [worker.oci]
              max-parallelism = 4

      - name: Install kubectl
        uses: azure/setup-kubectl@v4
        with:
          version: 'v1.30.0'

      - name: Install KinD
        uses: helm/kind-action@v1
        with:
          # Only install the binary; the cluster itself is created later by
          # `task test-infra:cluster-up`.
          install_only: true
          version: v0.24.0

      - name: Verify prerequisites
        run: |
          echo "=== Checking prerequisites ==="
          docker version
          kubectl version --client
          kind version
          echo "Go version: $(go version)"

      - name: Build Milo container image with caching
        uses: docker/build-push-action@v5
        with:
          context: .
          push: false
          load: true  # Load the image into Docker daemon for KinD
          tags: ${{ env.MILO_IMAGE_NAME }}:${{ env.MILO_IMAGE_TAG }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
          platforms: linux/amd64
          build-args: |
            GOPROXY=https://proxy.golang.org,direct

      - name: Set up test infrastructure cluster
        run: |
          echo "Setting up test infrastructure cluster (using pre-built image)..."
          # Create the KinD cluster
          task test-infra:cluster-up
          # Generate required code
          task generate:code
          # Load the pre-built image (will skip build since image exists)
          task dev:load
          # Deploy Milo control plane (dependencies already satisfied)
          task dev:deploy

      - name: Wait for networking path to be ready
        run: |
          echo "=== Waiting for networking path to be ready ==="
          # Quick verification that KinD cluster is running
          docker ps --filter "name=$TEST_INFRA_CLUSTER_NAME"
          # Verify all Milo components are running
          echo "Checking Milo control plane components:"
          task test-infra:kubectl -- get pods -n milo-system
          # Wait for Envoy Gateway to be ready (required for API server access).
          # Best-effort: a failed wait only prints a note and the step continues.
          echo "⏳ Waiting for Envoy Gateway to be ready..."
          task test-infra:kubectl -- wait --for=condition=Ready pod \
            -l app.kubernetes.io/name=envoy-gateway \
            -n envoy-gateway-system --timeout=120s \
            || echo "Note: Envoy Gateway might already be ready from previous runs"
          # Check that the HTTPRoute is programmed (informational only)
          echo "Checking HTTPRoute status..."
          task test-infra:kubectl -- get httproute milo-apiserver -n milo-system -o yaml | grep -A5 "status:" || true
          # Check that the service exists and has endpoints
          echo "Checking Milo API server service and endpoints..."
          task test-infra:kubectl -- get service milo-apiserver -n milo-system
          task test-infra:kubectl -- get endpoints milo-apiserver -n milo-system
          echo "✓ Networking verification complete"

      - name: Verify Milo API server connectivity
        run: |
          echo "=== Verifying Milo API server ==="
          # Check that kubeconfig exists
          if [ ! -f ".milo/kubeconfig" ]; then
            echo "ERROR: Milo kubeconfig not found at .milo/kubeconfig"
            exit 1
          fi
          # Test API server health endpoint with brief retry
          echo "Testing Milo API server health..."
          RETRY_COUNT=0
          MAX_RETRIES=6  # 6 retries * 5 seconds = 30 seconds
          while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do
            if task kubectl -- get --raw /healthz 2>/dev/null; then
              echo "✓ Milo API server is healthy"
              break
            else
              RETRY_COUNT=$((RETRY_COUNT + 1))
              # No sleep after the final failed attempt
              if [ $RETRY_COUNT -lt $MAX_RETRIES ]; then
                echo "API server not responding yet (attempt $RETRY_COUNT/$MAX_RETRIES), waiting 5 seconds..."
                sleep 5
              fi
            fi
          done
          # The counter only reaches MAX_RETRIES when every attempt failed;
          # a successful attempt breaks out before the increment.
          if [ $RETRY_COUNT -eq $MAX_RETRIES ]; then
            echo "ERROR: Milo API server health check failed after 30 seconds"
            echo "Debugging output:"
            task kubectl -- get --raw /healthz || true
            task test-infra:kubectl -- logs -n milo-system -l app.kubernetes.io/name=milo-apiserver --tail=20
            exit 1
          fi

      - name: Run end-to-end tests
        env:
          # The user-supplied input is handed to the script through the
          # environment instead of being expanded into the script text, so its
          # contents are never interpreted as shell code. Empty on pull_request.
          TEST_SUITE: ${{ github.event.inputs.test_suite }}
        run: |
          echo "=== Running end-to-end tests ==="
          # Determine which tests to run based on input
          if [ -n "$TEST_SUITE" ]; then
            echo "Running specified test suite: $TEST_SUITE"
            # Quoted: the suite name is forwarded as a single argument
            task test:end-to-end -- "$TEST_SUITE"
          else
            echo "Running all end-to-end tests..."
            task test:end-to-end
          fi

      - name: Collect debug information on failure
        if: failure()
        # Every command is best-effort (`|| true`) so one failing collector
        # does not prevent the remaining diagnostics from being gathered.
        run: |
          echo "=== Collecting debug information ==="
          # Cluster status
          echo "=== Infrastructure Cluster Status ==="
          task test-infra:kubectl -- get pods -A || true
          task test-infra:kubectl -- get nodes -o wide || true
          # Milo control plane status
          echo "=== Milo Control Plane Status ==="
          task test-infra:kubectl -- describe pods -n milo-system || true
          # Milo API server logs
          echo "=== Milo API Server Logs ==="
          task test-infra:kubectl -- logs -n milo-system -l app.kubernetes.io/name=milo-apiserver --tail=100 || true
          # Controller manager logs
          echo "=== Controller Manager Logs ==="
          task test-infra:kubectl -- logs -n milo-system -l app.kubernetes.io/name=milo-controller-manager --tail=100 || true
          # Docker container status
          echo "=== Docker Containers ==="
          docker ps -a || true
          # KinD cluster info
          echo "=== KinD cluster info ==="
          kind get clusters || true
          kind export logs /tmp/kind-logs --name "$TEST_INFRA_CLUSTER_NAME" || true

      - name: Upload debug artifacts
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: debug-logs
          path: |
            /tmp/kind-logs/
            .milo/kubeconfig
          # upload-artifact v4 excludes hidden files and directories by
          # default, which would silently drop .milo/kubeconfig.
          # NOTE(review): the kubeconfig is assumed to hold credentials only
          # for the throwaway test cluster - confirm before relying on this.
          include-hidden-files: true
          if-no-files-found: ignore

      - name: Cleanup test infrastructure
        if: always()
        run: |
          echo "=== Cleaning up test infrastructure ==="
          # Clean up test infrastructure cluster
          task test-infra:cluster-down || true
          # Verify cleanup
          echo "Remaining KinD clusters:"
          kind get clusters || true
          echo "Remaining Docker containers:"
          docker ps -a --filter "name=$TEST_INFRA_CLUSTER_NAME" || true