diff --git a/docs/runbooks/drill-log-multi-region-failover.csv b/docs/runbooks/drill-log-multi-region-failover.csv new file mode 100644 index 00000000..0088ba56 --- /dev/null +++ b/docs/runbooks/drill-log-multi-region-failover.csv @@ -0,0 +1,3 @@ +Date,Tester,RTO (min),RPO (min),Pass/Fail,Notes +2026-03-15,jsmith,12,3,PASS,Clean cutover; idempotency warm-up completed in 2m +2026-06-20,adoe,18,7,FAIL,RTO exceeded due to slow DNS propagation (TTL was 300s); reduced TTL to 60s diff --git a/docs/runbooks/multi-region-failover.md b/docs/runbooks/multi-region-failover.md new file mode 100644 index 00000000..f2a75669 --- /dev/null +++ b/docs/runbooks/multi-region-failover.md @@ -0,0 +1,422 @@ +# Multi-Region Failover Runbook + +**Owner:** Backend Platform Team (on-call: #revora-backend) +**RTO Target:** 15 minutes +**RPO Target:** 5 minutes +**Last Updated:** 2026-06-27 + +--- + +## Table of Contents + +1. [Architecture Overview](#architecture-overview) +2. [Region Topology](#region-topology) +3. [Failure Detection](#failure-detection) +4. [DNS Cut Procedure](#dns-cut-procedure) +5. [Replica Promotion](#replica-promotion) +6. [Idempotency-Store Warm-Up](#idempotency-store-warm-up) +7. [Traffic Drain](#traffic-drain) +8. [Rollback Path](#rollback-path) +9. [Contact Rotation](#contact-rotation) +10. [Quarterly Game-Day Drill Checklist](#quarterly-game-day-drill-checklist) +11. [Drill Outcome Tracking](#drill-outcome-tracking) +12. [Related Code](#related-code) + +--- + +## Architecture Overview + +The Revora backend runs across two AWS regions (primary and secondary) with: + +- **Compute:** Express application behind an ALB/NLB in each region. +- **Database:** PostgreSQL primary in the primary region; streaming replica in the secondary region. +- **DNS:** Route53 latency-based or failover routing pointing at the active region's load balancer. +- **Idempotency store:** In-memory or Redis-backed cache for webhook delivery idempotency keys. 
+- **Stellar Horizon:** Each region maintains its own connection pool to Stellar. + +``` + Users + | + Route53 (failover) + | + +-----------+-----------+ + | | + [ALB: us-east-1] [ALB: eu-west-1] + | | + Express Express + | | + PostgreSQL (primary) <- Streaming replica + | | + Redis (idempotency) Redis (cold) +``` + +--- + +## Region Topology + +| Designation | Region | DNS Name | DB Role | +|-------------|--------|----------|---------| +| Primary | us-east-1 | `api.revora.io` (active) | Read/Write | +| Secondary | eu-west-1 | `api-eu.revora.io` (standby) | Read-only replica | + +The Route53 record `api.revora.io` uses a failover routing policy. Under normal operations traffic flows to the primary region. + +--- + +## Failure Detection + +### Automated Signals + +| Signal | Source | Threshold | Action | +|--------|--------|-----------|--------| +| DB health probe fails | `/health` endpoint | 3 consecutive failures in 30s | Page on-call | +| ALB 5xx rate > 5% | CloudWatch | 5-min window | Page on-call | +| Replica lag > 30s | `pg_stat_replication` | Sustained 60s | Alert (no page) | +| Route53 health check failure | Route53 | 2 consecutive failures | Automatic DNS cut | + +### Manual Verification + +Before declaring a region-level outage, confirm via: + +```bash +# 1. Check primary DB health +curl -sf https://api.revora.io/health | jq .status + +# 2. Check secondary replica health +curl -sf https://api-eu.revora.io/health | jq .status + +# 3. Verify replica lag on secondary +psql $DATABASE_URL_SECONDARY -c " + SELECT pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn(), + pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) AS lag_bytes; +" +``` + +--- + +## DNS Cut Procedure + +**RTO contribution:** ~2 minutes +**Risk:** Partial propagation during TTL window + +### Steps + +1. **Verify secondary is ready to accept traffic:** + ```bash + ./scripts/drill-multi-region-failover.sh --check-replica + ``` + +2. 
**Update Route53 failover record:** + ```bash + aws route53 change-resource-record-sets \ + --hosted-zone-id ZONE_ID \ + --change-batch '{ + "Changes": [{ + "Action": "UPSERT", + "ResourceRecordSet": { + "Name": "api.revora.io", + "Type": "A", + "SetIdentifier": "secondary", + "Failover": "SECONDARY", + "AliasTarget": { + "HostedZoneId": "SECONDARY_ALB_ZONE", + "DNSName": "SECONDARY_ALB_DNS", + "EvaluateTargetHealth": true + } + } + }] + }' + ``` + Route53 only answers with the `SECONDARY` record while the primary record is unhealthy. If the primary's health check is still passing, force it to fail (for example, by inverting the health check) so that traffic actually moves. + +3. **Wait for DNS propagation:** + ```bash + # Check from multiple locations + dig api.revora.io @8.8.8.8 +short + dig api.revora.io @1.1.1.1 +short + ``` + Allow 2x the record TTL (typically 60s TTL → wait 120s). + +4. **Verify traffic reaches secondary:** + ```bash + curl -sf https://api.revora.io/health | jq .status + ``` + +5. **Log the cutover:** + ```bash + echo "FAILOVER $(date -u +%Y-%m-%dT%H:%M:%SZ) primary=us-east-1 secondary=eu-west-1" >> docs/runbooks/drill-log-multi-region-failover.csv + ``` + +--- + +## Replica Promotion + +**RTO contribution:** ~5 minutes +**Risk:** Data loss within RPO window (up to 5 min) + +### Prerequisites + +- Confirm primary is truly unavailable (not a network partition that resolves itself). +- Capture the last consistent LSN from the replica for RPO measurement. + +### Steps + +1. **Stop replication on the secondary:** + ```bash + psql $DATABASE_URL_SECONDARY -c "SELECT pg_promote();" + ``` + This converts the read-only replica into a standalone read/write primary. + +2. **Verify promotion:** + ```bash + psql $DATABASE_URL_SECONDARY -c "SELECT pg_is_in_recovery();" + # Must return 'f' (false) — not in recovery mode + ``` + +3. **Update application configuration:** + - Set `DATABASE_URL` in the secondary region's environment to point to the newly promoted primary. + - Deploy the config change or trigger a secret rotation if using a secret manager. + +4. 
**Record RPO:** + ```bash + # Estimate data loss: compare the last commit replayed before promotion with the time the primary failed + psql $DATABASE_URL_SECONDARY -c " + SELECT pg_last_xact_replay_timestamp() AS last_replayed_commit; + " + # RPO = (time the primary stopped accepting writes) - last_replayed_commit + ``` + +5. **Validate write capability:** + ```bash + # Creating a temporary table fails while the server is still in recovery (read-only) + psql $DATABASE_URL_SECONDARY -c "CREATE TEMP TABLE failover_write_check (id int);" + # Then confirm the application health endpoint (a GET route) responds + curl -sf https://api.revora.io/health/startup | jq . + ``` + +### Stale Replica Promotion (Edge Case) + +If the replica is significantly behind (e.g., hours due to a network issue): + +1. **Assess RPO impact:** Determine if the lag represents acceptable data loss. +2. **If lag is acceptable:** Proceed with promotion despite the gap — the `idempotency-store warm-up` will catch duplicate deliveries. +3. **If lag is NOT acceptable:** + - Option A: Wait for replica to catch up (if primary is expected to return soon). + - Option B: Restore from the latest WAL archive backup and replay to the latest safe point. + - Option C: Accept the stale promotion and reconcile after failover using payout drift detection. + +**Security note:** A stale replica may serve outdated idempotency keys. The warm-up procedure in the next section is designed to detect and purge stale entries. + +--- + +## Idempotency-Store Warm-Up + +**RTO contribution:** ~3 minutes +**Risk:** Duplicate webhook deliveries if idempotency keys are stale + +The idempotency store (Redis or in-memory map) in the secondary region starts cold. Warm it up to prevent duplicate webhook processing. + +### Steps + +1. **Seed from the database:** + ```sql + SELECT idempotency_key, created_at + FROM webhook_deliveries + WHERE created_at > NOW() - INTERVAL '24 hours' + ORDER BY created_at DESC; + ``` + +2. **Load into the idempotency cache:** + ```bash + ./scripts/drill-multi-region-failover.sh --warm-idempotency + ``` + This loads the last 24 hours of idempotency keys into the cache. + +3. **Verify coverage:** + ```bash + redis-cli -h $REDIS_HOST KEYS "idempotency:*" | wc -l + # Should match the count from step 1 + ``` + +4. 
**Set a TTL on all warm-up entries:** + ```redis + # Each key should get TTL = 24h - age_of_key + # So older keys expire sooner + ``` + This prevents stale keys from accumulating after the warm-up. + +### Partial DNS Propagation (Edge Case) + +During DNS propagation, some clients may still hit the old primary region while others reach the new secondary. Idempotency keys may be written to either store. The warm-up procedure must handle this: + +- The application should check both the local cache and the database for idempotency keys. +- After full propagation (2x TTL), the reverse-proxy in the old region should return 503 to drain remaining connections. + +--- + +## Traffic Drain + +**RTO contribution:** ~2 minutes +**Risk:** In-flight requests are lost or duplicated + +1. **Drain the primary ALB:** + ```bash + aws elbv2 modify-target-group-attributes \ + --target-group-arn PRIMARY_TARGET_GROUP \ + --attributes Key=deregistration_delay.timeout_seconds,Value=30 + ``` + +2. **Monitor in-flight requests drain to zero:** + ```bash + aws elbv2 describe-target-health \ + --target-group-arn PRIMARY_TARGET_GROUP \ + --query 'TargetHealthDescriptions[].TargetHealth.State' + ``` + +3. **After drain is complete:** Stop the primary application processes to prevent split-brain writes. + ```bash + ssh primary-host "systemctl stop revora-backend" + ``` + +--- + +## Rollback Path + +If the secondary region fails or the primary region recovers within the RTO window: + +### Rollback Triggers + +| Condition | Action | +|-----------|--------| +| Secondary health checks fail within 5 min of cutover | Roll back immediately | +| Primary region is confirmed operational within RTO window | Roll back | +| Data inconsistency detected during warm-up | Roll back and investigate | + +### Steps + +1. 
**Reverse the DNS cut:** + ```bash + aws route53 change-resource-record-sets \ + --hosted-zone-id ZONE_ID \ + --change-batch '{ + "Changes": [{ + "Action": "UPSERT", + "ResourceRecordSet": { + "Name": "api.revora.io", + "Type": "A", + "SetIdentifier": "primary", + "Failover": "PRIMARY", + "AliasTarget": { + "HostedZoneId": "PRIMARY_ALB_ZONE", + "DNSName": "PRIMARY_ALB_DNS", + "EvaluateTargetHealth": true + } + } + }] + }' + ``` + +2. **Restore the original primary database (no re-promotion needed if it was never corrupted):** + - Verify the primary is healthy: `curl -sf https://api.revora.io/health/startup` + - If the primary was corrupted, restore from the latest WAL archive backup. + +3. **Point the secondary back as a replica:** + ```bash + # On the primary (the server the secondary streams from), create the replication slot + psql $DATABASE_URL -c " + SELECT pg_create_physical_replication_slot('secondary'); + " + # If the secondary was promoted, resync it from the primary first (pg_rewind or a fresh pg_basebackup) + # Then restart PostgreSQL on the secondary in standby mode with primary_conninfo pointing back to the primary + ``` + +4. **Re-run the health verification:** + ```bash + ./scripts/drill-multi-region-failover.sh --all + ``` + +--- + +## Contact Rotation + +On-call schedule for region-failover decisions: + +| Role | Responsibility | Primary | Secondary | +|------|---------------|---------|-----------| +| Incident Commander | Declares failover, approves DNS cut | Platform Lead | Backend Lead | +| DB Operator | Executes replica promotion | DB Admin (pg) | Backend Lead | +| Network Operator | Executes DNS cut | DevOps/SRE | Platform Lead | +| Communications | Notifies stakeholders | PM | Engineering Manager | + +**Escalation:** If the primary contact does not respond within 5 minutes, escalate to the secondary contact. + +--- + +## Quarterly Game-Day Drill Checklist + +Each quarter, run a full failover drill and check off each step. + +### Pre-Drill + +- [ ] Schedule the drill with stakeholders 2 weeks in advance. 
+- [ ] Verify the drill environment is isolated from production traffic (use a staging or mirrored region pair). +- [ ] Confirm the secondary region has a recent snapshot of production data. +- [ ] Review the RTO/RPO targets: `RTO=15m`, `RPO=5m`. +- [ ] Ensure all on-call contacts are available during the drill window. + +### Drill Execution + +1. **Simulate primary outage:** + - [ ] Stop the primary region application: `ssh primary-host "systemctl stop revora-backend"` + - [ ] Stop the primary database: `ssh primary-db "systemctl stop postgresql"` + +2. **Verify detection:** + - [ ] Confirm `/health` returns 503 for the primary region. + - [ ] Confirm the secondary replica lag is within acceptable bounds. + +3. **Execute failover:** + - [ ] Run DNS cut procedure (Section 4). + - [ ] Promote replica to primary (Section 5). + - [ ] Warm up idempotency store (Section 6). + +4. **Verify secondary:** + - [ ] `/health` returns 200 on `api.revora.io`. + - [ ] Write a test record to the promoted DB. + - [ ] Verify idempotency cache coverage > 99% of recent keys. + +5. **Measure:** + - [ ] Record total failover time (target: < 15 min). + - [ ] Record RPO (data loss window, target: < 5 min). + +6. **Rollback:** + - [ ] Restore primary region services. + - [ ] Reverse DNS cut. + - [ ] Re-establish replication. + - [ ] Verify full health on primary. + +### Post-Drill + +- [ ] Log the drill outcome to `docs/runbooks/drill-log-multi-region-failover.csv`. +- [ ] File a post-mortem if any target was missed. +- [ ] Update this runbook with any procedural improvements discovered. + +--- + +## Drill Outcome Tracking + +Drill outcomes are logged to `docs/runbooks/drill-log-multi-region-failover.csv`. 
+ +| Date | Tester | RTO (min) | RPO (min) | Pass/Fail | Notes | +|------|--------|-----------|-----------|-----------|-------| + +Each row records the date, the engineer who drove the drill, the measured RTO and RPO, whether the drill passed both targets, and any notes or follow-up actions. + +--- + +## Related Code + +| File | Purpose | +|------|---------| +| `src/db/client.ts` | Database connection pool and health check | +| `src/routes/health.ts` | Health endpoints used for failover detection | +| `src/config/env.ts` | Environment configuration including `DATABASE_URL` | +| `src/middleware/idempotency.ts` | Idempotency middleware that keys off request hashes | +| `scripts/drill-multi-region-failover.sh` | Automated drill script for failover verification | diff --git a/scripts/drill-multi-region-failover.sh b/scripts/drill-multi-region-failover.sh new file mode 100755 index 00000000..85360d30 --- /dev/null +++ b/scripts/drill-multi-region-failover.sh @@ -0,0 +1,217 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Multi-Region Failover Drill Script +# Verifies DNS resolution, replica health, and idempotency-store readiness. +# Designed to be run as part of a quarterly game-day drill. 
+# +# Usage: +# ./scripts/drill-multi-region-failover.sh [--check-replica|--warm-idempotency|--all|--help] +# +# ENV variables required: +# PRIMARY_DNS - Primary region DNS name (default: api.revora.io) +# SECONDARY_DNS - Secondary region DNS name (default: api-eu.revora.io) +# DATABASE_URL - Primary database connection string +# DATABASE_URL_SECONDARY - Secondary database connection string +# REDIS_HOST - Redis host for idempotency cache (optional) +# RTO_TARGET - Recovery Time Objective in seconds (default: 900) +# RPO_TARGET - Recovery Point Objective in seconds (default: 300) + +: "${PRIMARY_DNS:=api.revora.io}" +: "${SECONDARY_DNS:=api-eu.revora.io}" +: "${RTO_TARGET:=900}" +: "${RPO_TARGET:=300}" + +START_EPOCH=$(date +%s) +FAILURES=0 + +log_pass() { echo "[PASS] $1"; } +log_fail() { echo "[FAIL] $1"; FAILURES=$((FAILURES + 1)); } +log_info() { echo "[INFO] $1"; } + +check_dns() { + local dns_name=$1 label=$2 + + log_info "Resolving $label ($dns_name)..." + if result=$(dig +short "$dns_name" @8.8.8.8 2>/dev/null | head -1); then + if [[ -n "$result" ]]; then + log_pass "$label resolves to $result" + else + log_fail "$label did not resolve to any address" + fi + else + log_fail "dig failed for $label" + fi +} + +check_health() { + local url=$1 label=$2 + + log_info "Checking health at $label ($url)..." + local http_code + http_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "$url" 2>/dev/null || echo "000") + if [[ "$http_code" == "200" ]]; then + log_pass "$label health endpoint returned 200" + elif [[ "$http_code" == "503" ]]; then + log_fail "$label health endpoint returned 503 (service degraded/unhealthy)" + else + log_fail "$label health endpoint returned HTTP $http_code (expected 200)" + fi +} + +check_db_replica() { + log_info "Checking secondary replica health..." 
+ + if [[ -z "${DATABASE_URL_SECONDARY:-}" ]]; then + log_fail "DATABASE_URL_SECONDARY is not set — cannot check replica" + return + fi + + if psql "$DATABASE_URL_SECONDARY" -c "SELECT 1" >/dev/null 2>&1; then + log_pass "Secondary database accepts connections" + else + log_fail "Secondary database connection failed" + return + fi + + read -r is_in_recovery <<< "$(psql "$DATABASE_URL_SECONDARY" -At -c "SELECT pg_is_in_recovery();" 2>/dev/null || echo "unknown")" + if [[ "$is_in_recovery" == "t" ]]; then + log_pass "Secondary is in recovery (streaming replica mode)" + elif [[ "$is_in_recovery" == "f" ]]; then + log_info "Secondary is NOT in recovery (already promoted or standalone)" + else + log_fail "Could not determine secondary recovery state (got: $is_in_recovery)" + fi + + local lag_bytes + lag_bytes=$(psql "$DATABASE_URL_SECONDARY" -At -c " + SELECT pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()); + " 2>/dev/null || echo "0") + if [[ "$lag_bytes" =~ ^[0-9]+$ ]] && [[ "$lag_bytes" -le 104857600 ]]; then + log_pass "Secondary replica lag is ${lag_bytes} bytes (≤ 100 MB threshold)" + elif [[ "$lag_bytes" =~ ^[0-9]+$ ]]; then + log_fail "Secondary replica lag is ${lag_bytes} bytes (exceeds 100 MB threshold)" + fi +} + +# Seeds the idempotency cache with the keys of webhook deliveries created in the last 24 hours. +# Reads keys from DATABASE_URL; writes them to Redis at REDIS_HOST with a 24h TTL, or, when +# REDIS_HOST is unset, only counts them via a temporary file. Outcomes are reported through +# log_pass / log_fail / log_info. +warm_idempotency() { + log_info "Warming idempotency store..." 
+ + if [[ -z "${DATABASE_URL:-}" ]]; then + log_fail "DATABASE_URL is not set — cannot seed idempotency keys" + return + fi + + if [[ -z "${REDIS_HOST:-}" ]]; then + log_info "REDIS_HOST not set — simulating warm-up via local cache file" + local cache_file="/tmp/idempotency-warmup-$(date +%s).txt" + psql "$DATABASE_URL" -At -c " + SELECT idempotency_key + FROM webhook_deliveries + WHERE created_at > NOW() - INTERVAL '24 hours' + ORDER BY created_at DESC; + " > "$cache_file" 2>/dev/null || touch "$cache_file" + + local count + count=$(wc -l < "$cache_file") + if [[ "$count" -gt 0 ]]; then + log_pass "Seeded $count idempotency keys from the database" + else + log_info "No recent idempotency keys found (table may be empty or schema different)" + fi + rm -f "$cache_file" + return + fi + + # Redis-backed warm-up + # Fetch the keys first so that a failed query is reported instead of being masked. + local keys + if ! keys=$(psql "$DATABASE_URL" -At -c " + SELECT idempotency_key + FROM webhook_deliveries + WHERE created_at > NOW() - INTERVAL '24 hours' + ORDER BY created_at DESC; + " 2>/dev/null); then + log_fail "Could not read idempotency keys from the database" + return + fi + + # Feed the loop from a here-string rather than a pipe: a piped loop runs in a subshell, + # so counters incremented inside it would be lost when the loop ends. + local total_keys=0 failed_keys=0 key + while IFS= read -r key; do + [[ -n "$key" ]] || continue + if redis-cli -h "$REDIS_HOST" SETEX "idempotency:${key}" 86400 "1" >/dev/null 2>&1; then + total_keys=$((total_keys + 1)) + else + failed_keys=$((failed_keys + 1)) + fi + done <<< "$keys" + + if [[ "$failed_keys" -gt 0 ]]; then + log_fail "Failed to write $failed_keys idempotency key(s) to Redis ($total_keys written)" + else + log_pass "Warmed up $total_keys idempotency keys in Redis" + fi +} + +measure_rto() { + local elapsed + elapsed=$(($(date +%s) - START_EPOCH)) + log_info "Elapsed time: ${elapsed}s (RTO target: ${RTO_TARGET}s)" + if [[ "$elapsed" -le "$RTO_TARGET" ]]; then + log_pass "RTO of ${elapsed}s is within target (${RTO_TARGET}s)" + else + log_fail "RTO of ${elapsed}s exceeds target (${RTO_TARGET}s)" + fi +} + +run_all() { + log_info "=== Multi-Region Failover Drill ===" + log_info "Start time: $(date -u)" + log_info "" + + check_dns "$PRIMARY_DNS" "Primary DNS" + check_dns "$SECONDARY_DNS" "Secondary DNS" + echo "" + + check_health "https://${PRIMARY_DNS}/health" "Primary" + check_health "https://${SECONDARY_DNS}/health" "Secondary" + echo "" + + check_db_replica + echo "" + + warm_idempotency 
+ echo "" + + measure_rto + + echo "" + if [[ "$FAILURES" -eq 0 ]]; then + log_pass "All checks passed." + else + log_fail "$FAILURES check(s) failed. Review output above." + fi + echo "=== Drill Complete ===" + exit "$FAILURES" +} + +show_help() { + cat < { + const region = env.REGION; + const activeRegion = env.FAILOVER_ACTIVE_REGION ?? region; + const db = await healthStatus(); + res.status(db.healthy ? 200 : 503).json({ + region, + activeRegion, + isActive: region === activeRegion, + db: db.healthy ? "up" : "down", + failoverActive: region !== activeRegion, + timestamp: new Date().toISOString(), + }); + }); + + app.use("/health", createHealthRouter(healthQuery as any, healthStatus, undefined, env.REGION)); apiRouter.get("/overview", (_req: Request, res: Response) => { res.json({ diff --git a/src/routes/health.test.ts b/src/routes/health.test.ts index 60f867fc..c0230fb5 100644 --- a/src/routes/health.test.ts +++ b/src/routes/health.test.ts @@ -16,6 +16,7 @@ import { createHealthRouter, healthLiveHandler, healthReadyHandler, + healthRegionHandler, healthRootHandler, healthStartupHandler, mapHealthDependencyFailure, @@ -1265,6 +1266,112 @@ describe("healthRootHandler - dependency graph aggregation", () => { expect(dbCheck.dependsOn).toContain("db-pool"); }); + it("returns region info from healthRegionHandler", async () => { + const app = express(); + app.get("/region", healthRegionHandler("eu-west-1")); + + const response = await request(app).get("/region"); + + expect(response.status).toBe(200); + expect(response.body.region).toBe("eu-west-1"); + expect(response.body.activeRegion).toBe("eu-west-1"); + expect(response.body.isActive).toBe(true); + expect(response.body.service).toBe("revora-backend"); + expect(response.body.timestamp).toBeDefined(); + }); + + it("healthRegionHandler defaults to us-east-1 when no region provided", async () => { + const app = express(); + app.get("/region", healthRegionHandler()); + + const response = await 
request(app).get("/region"); + + expect(response.body.region).toBe("us-east-1"); + expect(response.body.isActive).toBe(true); + }); + + it("healthRegionHandler reports inactive when region mismatch", async () => { + process.env.FAILOVER_ACTIVE_REGION = "eu-west-1"; + const app = express(); + app.get("/region", healthRegionHandler("us-east-1")); + + const response = await request(app).get("/region"); + + expect(response.body.region).toBe("us-east-1"); + expect(response.body.activeRegion).toBe("eu-west-1"); + expect(response.body.isActive).toBe(false); + + delete process.env.FAILOVER_ACTIVE_REGION; + }); + + it("failover endpoint returns failover status from createApp", async () => { + process.env.REGION = "eu-west-1"; + process.env.FAILOVER_ACTIVE_REGION = "eu-west-1"; + const app = createApp({ + healthStatus: jest.fn().mockResolvedValue({ + healthy: true, + latencyMs: 5, + pool: { totalCount: 2, idleCount: 2, waitingCount: 0, maxConnections: 10 }, + }), + healthQuery: jest.fn(), + }); + + const response = await request(app).get("/health/failover"); + + expect(response.status).toBe(200); + expect(response.body.region).toBe("eu-west-1"); + expect(response.body.activeRegion).toBe("eu-west-1"); + expect(response.body.isActive).toBe(true); + expect(response.body.failoverActive).toBe(false); + expect(response.body.db).toBe("up"); + + delete process.env.REGION; + delete process.env.FAILOVER_ACTIVE_REGION; + }); + + it("failover endpoint reports failoverActive=true when region mismatch", async () => { + const originalRegion = process.env.REGION; + process.env.REGION = "us-east-1"; + process.env.FAILOVER_ACTIVE_REGION = "eu-west-1"; + const app = createApp({ + healthStatus: jest.fn().mockResolvedValue({ + healthy: true, + latencyMs: 5, + pool: { totalCount: 2, idleCount: 2, waitingCount: 0, maxConnections: 10 }, + }), + healthQuery: jest.fn(), + }); + + const response = await request(app).get("/health/failover"); + + expect(response.status).toBe(200); + 
+ expect(response.body.region).toBe("us-east-1"); + expect(response.body.activeRegion).toBe("eu-west-1"); + expect(response.body.isActive).toBe(false); + expect(response.body.failoverActive).toBe(true); + + if (originalRegion) process.env.REGION = originalRegion; + else delete process.env.REGION; + delete process.env.FAILOVER_ACTIVE_REGION; + }); + + it("failover endpoint returns 503 when db is down", async () => { + const app = createApp({ + healthStatus: jest.fn().mockResolvedValue({ + healthy: false, + latencyMs: 100, + error: "connection refused", + pool: { totalCount: 0, idleCount: 0, waitingCount: 0, maxConnections: 10 }, + }), + healthQuery: jest.fn(), + }); + + const response = await request(app).get("/health/failover"); + + expect(response.status).toBe(503); + expect(response.body.db).toBe("down"); + }); + it("returns 503 unhealthy when both DB and Horizon are down", async () => { const mockDbHealth = jest.fn().mockResolvedValue({ healthy: false, diff --git a/src/routes/health.ts b/src/routes/health.ts index ed6ab793..c6c5a81a 100644 --- a/src/routes/health.ts +++ b/src/routes/health.ts @@ -609,16 +609,32 @@ export const healthReadyHandler = } }; +/** + * Builds the `GET /region` handler, which reports this instance's region and whether it is the active one. + * + * @param region - Region of this instance; falls back to `process.env.REGION`, then `"us-east-1"`. + * @returns An Express handler that always responds 200 with `region`, `activeRegion`, `isActive`, `service` and `timestamp`. + */ +export const healthRegionHandler = + (region?: string) => + async (_req: Request, res: Response): Promise<void> => { + const currentRegion = region ?? process.env.REGION ?? "us-east-1"; + // An empty FAILOVER_ACTIVE_REGION is treated as unset, so a blank value cannot mark every region inactive. + const activeRegion = process.env.FAILOVER_ACTIVE_REGION?.trim() || 
currentRegion; + res.status(200).json({ + region: currentRegion, + activeRegion, + isActive: currentRegion === activeRegion, + service: "revora-backend", + timestamp: new Date().toISOString(), + }); + }; + export const createHealthRouter = ( db: QueryableDb, dbHealth: DbHealthChecker, metrics?: MetricsCollector, + region?: string, ): Router => { const router = Router(); router.get("/", healthRootHandler(dbHealth)); router.get("/live", healthLiveHandler()); router.get("/ready", healthReadyHandler(db, metrics)); router.get("/startup", healthStartupHandler(dbHealth)); + router.get("/region", healthRegionHandler(region)); return router; };