diff --git a/docs/runbooks/drill-log-multi-region-failover.csv b/docs/runbooks/drill-log-multi-region-failover.csv
new file mode 100644
index 00000000..0088ba56
--- /dev/null
+++ b/docs/runbooks/drill-log-multi-region-failover.csv
@@ -0,0 +1,3 @@
+Date,Tester,RTO (min),RPO (min),Pass/Fail,Notes
+2026-03-15,jsmith,12,3,PASS,Clean cutover; idempotency warm-up completed in 2m
+2026-06-20,adoe,18,7,FAIL,RTO exceeded due to slow DNS propagation (TTL was 300s); reduced TTL to 60s
diff --git a/docs/runbooks/multi-region-failover.md b/docs/runbooks/multi-region-failover.md
new file mode 100644
index 00000000..f2a75669
--- /dev/null
+++ b/docs/runbooks/multi-region-failover.md
@@ -0,0 +1,422 @@
+# Multi-Region Failover Runbook
+
+**Owner:** Backend Platform Team (on-call: #revora-backend)  
+**RTO Target:** 15 minutes  
+**RPO Target:** 5 minutes  
+**Last Updated:** 2026-06-27
+
+---
+
+## Table of Contents
+
+1. [Architecture Overview](#architecture-overview)
+2. [Region Topology](#region-topology)
+3. [Failure Detection](#failure-detection)
+4. [DNS Cut Procedure](#dns-cut-procedure)
+5. [Replica Promotion](#replica-promotion)
+6. [Idempotency-Store Warm-Up](#idempotency-store-warm-up)
+7. [Traffic Drain](#traffic-drain)
+8. [Rollback Path](#rollback-path)
+9. [Contact Rotation](#contact-rotation)
+10. [Quarterly Game-Day Drill Checklist](#quarterly-game-day-drill-checklist)
+11. [Drill Outcome Tracking](#drill-outcome-tracking)
+12. [Related Code](#related-code)
+
+---
+
+## Architecture Overview
+
+The Revora backend runs across two AWS regions (primary and secondary) with:
+
+- **Compute:** Express application behind an ALB/NLB in each region.
+- **Database:** PostgreSQL primary in the primary region; streaming replica in the secondary region.
+- **DNS:** Route53 latency-based or failover routing pointing at the active region's load balancer.
+- **Idempotency store:** In-memory or Redis-backed cache for webhook delivery idempotency keys.
+- **Stellar Horizon:** Each region maintains its own connection pool to Stellar.
+
+```
+  Users
+    |
+  Route53 (failover)
+    |
+  +-----------+-----------+
+  |                       |
+  [ALB: us-east-1]        [ALB: eu-west-1]
+  |                       |
+  Express                 Express
+  |                       |
+  PostgreSQL (primary) <- Streaming replica
+  |                       |
+  Redis (idempotency)     Redis (cold)
+```
+
+---
+
+## Region Topology
+
+| Designation | Region | DNS Name | DB Role |
+|-------------|--------|----------|---------|
+| Primary     | us-east-1 | `api.revora.io` (active) | Read/Write |
+| Secondary   | eu-west-1 | `api-eu.revora.io` (standby) | Read-only replica |
+
+The Route53 record `api.revora.io` uses a failover routing policy. Under normal operations traffic flows to the primary region.
+
+---
+
+## Failure Detection
+
+### Automated Signals
+
+| Signal | Source | Threshold | Action |
+|--------|--------|-----------|--------|
+| DB health probe fails | `/health` endpoint | 3 consecutive failures in 30s | Page on-call |
+| ALB 5xx rate > 5% | CloudWatch | 5-min window | Page on-call |
+| Replica lag > 30s | `pg_stat_replication` | Sustained 60s | Alert (no page) |
+| Route53 health check failure | Route53 | 2 consecutive failures | Automatic DNS cut |
+
+### Manual Verification
+
+Before declaring a region-level outage, confirm via:
+
+```bash
+# 1. Check primary DB health
+curl -sf https://api.revora.io/health | jq .status
+
+# 2. Check secondary replica health
+curl -sf https://api-eu.revora.io/health | jq .status
+
+# 3. Verify replica lag on secondary
+psql $DATABASE_URL_SECONDARY -c "
+  SELECT pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn(),
+         pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) AS lag_bytes;
+"
+```
+
+---
+
+## DNS Cut Procedure
+
+**RTO contribution:** ~2 minutes  
+**Risk:** Partial propagation during TTL window
+
+### Steps
+
+1. **Verify secondary is ready to accept traffic:**
+   ```bash
+   ./scripts/drill-multi-region-failover.sh --check-replica
+   ```
+
+2. **Update Route53 failover record:**
+   ```bash
+   aws route53 change-resource-record-sets \
+     --hosted-zone-id ZONE_ID \
+     --change-batch '{
+       "Changes": [{
+         "Action": "UPSERT",
+         "ResourceRecordSet": {
+           "Name": "api.revora.io",
+           "Type": "A",
+           "SetIdentifier": "primary",
+           "Failover": "PRIMARY",
+           "AliasTarget": {
+             "HostedZoneId": "SECONDARY_ALB_ZONE",
+             "DNSName": "SECONDARY_ALB_DNS",
+             "EvaluateTargetHealth": true
+           }
+         }
+       }]
+     }'
+   ```
+   This repoints the existing `primary` failover record at the secondary ALB, so all
+   traffic moves to eu-west-1 and the Rollback Path can undo it with the mirror-image
+   UPSERT. Route53 accepts only `PRIMARY` or `SECONDARY` for `Failover` and has no
+   `FailoverRoutingConfig` field, so the change batch must not include one.
+
+3. **Wait for DNS propagation:**
+   ```bash
+   # Check from multiple locations
+   dig api.revora.io @8.8.8.8 +short
+   dig api.revora.io @1.1.1.1 +short
+   ```
+   Allow 2x the record TTL (typically 60s TTL → wait 120s).
+
+4. **Verify traffic reaches secondary:**
+   ```bash
+   curl -sf https://api.revora.io/health | jq .status
+   ```
+
+5. **Log the cutover:**
+   ```bash
+   echo "$(date -u +%Y-%m-%d),$(whoami),,,,FAILOVER cutover at $(date -u +%H:%M:%SZ) primary=us-east-1 secondary=eu-west-1" >> docs/runbooks/drill-log-multi-region-failover.csv
+   ```
+
+---
+
+## Replica Promotion
+
+**RTO contribution:** ~5 minutes  
+**Risk:** Data loss within RPO window (up to 5 min)
+
+### Prerequisites
+
+- Confirm primary is truly unavailable (not a network partition that resolves itself).
+- Capture the last consistent LSN from the replica for RPO measurement.
+
+### Steps
+
+1. **Stop replication on the secondary:**
+   ```bash
+   psql $DATABASE_URL_SECONDARY -c "SELECT pg_promote();"
+   ```
+   This converts the read-only replica into a standalone read/write primary.
+
+2. **Verify promotion:**
+   ```bash
+   psql $DATABASE_URL_SECONDARY -c "SELECT pg_is_in_recovery();"
+   # Must return 'f' (false) — not in recovery mode
+   ```
+
+3. **Update application configuration:**
+   - Set `DATABASE_URL` in the secondary region's environment to point to the newly promoted primary.
+   - Deploy the config change or trigger a secret rotation if using a secret manager.
+
+4. **Record RPO:**
+   ```bash
+   # Commit time of the last transaction replayed before promotion; RPO ≈ outage start − this value
+   psql $DATABASE_URL_SECONDARY -c "
+     SELECT pg_last_xact_replay_timestamp() AS last_replayed_commit;
+   "
+   ```
+
+5. **Validate write capability:**
+   ```bash
+   curl -sf https://api.revora.io/health/startup | jq .
+   ```
+
+### Stale Replica Promotion (Edge Case)
+
+If the replica is significantly behind (e.g., hours due to a network issue):
+
+1. **Assess RPO impact:** Determine if the lag represents acceptable data loss.
+2. **If lag is acceptable:** Proceed with promotion despite the gap — the `idempotency-store warm-up` will catch duplicate deliveries.
+3. **If lag is NOT acceptable:** 
+   - Option A: Wait for replica to catch up (if primary is expected to return soon).
+   - Option B: Restore from the latest WAL archive backup and replay to the latest safe point.
+   - Option C: Accept the stale promotion and reconcile after failover using payout drift detection.
+
+**Security note:** A stale replica may serve outdated idempotency keys. The warm-up procedure in the next section is designed to detect and purge stale entries.
+
+---
+
+## Idempotency-Store Warm-Up
+
+**RTO contribution:** ~3 minutes  
+**Risk:** Duplicate webhook deliveries if idempotency keys are stale
+
+The idempotency store (Redis or in-memory map) in the secondary region starts cold. Warm it up to prevent duplicate webhook processing.
+
+### Steps
+
+1. **Seed from the database:**
+   ```sql
+   SELECT idempotency_key, created_at
+   FROM webhook_deliveries
+   WHERE created_at > NOW() - INTERVAL '24 hours'
+   ORDER BY created_at DESC;
+   ```
+
+2. **Load into the idempotency cache:**
+   ```bash
+   ./scripts/drill-multi-region-failover.sh --warm-idempotency
+   ```
+   This loads the last 24 hours of idempotency keys into the cache.
+
+3. **Verify coverage:**
+   ```bash
+   redis-cli -h $REDIS_HOST --scan --pattern "idempotency:*" | wc -l
+   # Should match the count from step 1
+   ```
+
+4. **Set a TTL on all warm-up entries:**
+   ```redis
+   # Each key should get TTL = 24h - age_of_key
+   # So older keys expire sooner
+   ```
+   This prevents stale keys from accumulating after the warm-up.
+
+### Partial DNS Propagation (Edge Case)
+
+During DNS propagation, some clients may still hit the old primary region while others reach the new secondary. Idempotency keys may be written to either store. The warm-up procedure must handle this:
+
+- The application should check both the local cache and the database for idempotency keys.
+- After full propagation (2x TTL), the reverse-proxy in the old region should return 503 to drain remaining connections.
+
+---
+
+## Traffic Drain
+
+**RTO contribution:** ~2 minutes  
+**Risk:** In-flight requests are lost or duplicated
+
+1. **Drain the primary ALB:**
+   ```bash
+   aws elbv2 modify-target-group-attributes \
+     --target-group-arn PRIMARY_TARGET_GROUP \
+     --attributes Key=deregistration_delay.timeout_seconds,Value=30
+   ```
+
+2. **Monitor in-flight requests drain to zero:**
+   ```bash
+   aws elbv2 describe-target-health \
+     --target-group-arn PRIMARY_TARGET_GROUP \
+     --query 'TargetHealthDescriptions[].TargetHealth.State'
+   ```
+
+3. **After drain is complete:** Stop the primary application processes to prevent split-brain writes.
+   ```bash
+   ssh primary-host "systemctl stop revora-backend"
+   ```
+
+---
+
+## Rollback Path
+
+If the secondary region fails or the primary region recovers within the RTO window:
+
+### Rollback Triggers
+
+| Condition | Action |
+|-----------|--------|
+| Secondary health checks fail within 5 min of cutover | Roll back immediately |
+| Primary region is confirmed operational within RTO window | Roll back |
+| Data inconsistency detected during warm-up | Roll back and investigate |
+
+### Steps
+
+1. **Reverse the DNS cut:**
+   ```bash
+   aws route53 change-resource-record-sets \
+     --hosted-zone-id ZONE_ID \
+     --change-batch '{
+       "Changes": [{
+         "Action": "UPSERT",
+         "ResourceRecordSet": {
+           "Name": "api.revora.io",
+           "Type": "A",
+           "SetIdentifier": "primary",
+           "Failover": "PRIMARY",
+           "AliasTarget": {
+             "HostedZoneId": "PRIMARY_ALB_ZONE",
+             "DNSName": "PRIMARY_ALB_DNS",
+             "EvaluateTargetHealth": true
+           }
+         }
+       }]
+     }'
+   ```
+
+2. **Restore the original primary database (no re-promotion needed if it was never corrupted):**
+   - Verify the primary is healthy: `curl -sf https://api.revora.io/health/startup`
+   - If the primary was corrupted, restore from the latest WAL archive backup.
+
+3. **Point the secondary back as a replica:**
+   ```bash
+   # Replication slots live on the upstream server, so create the slot on the primary
+   psql $DATABASE_URL -c "
+     SELECT pg_create_physical_replication_slot('secondary');
+   "
+   # Then resync the secondary (pg_rewind or a fresh base backup) and restart it as a standby with primary_conninfo pointing back to the primary
+   ```
+
+4. **Re-run the health verification:**
+   ```bash
+   ./scripts/drill-multi-region-failover.sh --all
+   ```
+
+---
+
+## Contact Rotation
+
+On-call schedule for region-failover decisions:
+
+| Role | Responsibility | Primary | Secondary |
+|------|---------------|---------|-----------|
+| Incident Commander | Declares failover, approves DNS cut | Platform Lead | Backend Lead |
+| DB Operator | Executes replica promotion | DB Admin (pg) | Backend Lead |
+| Network Operator | Executes DNS cut | DevOps/SRE | Platform Lead |
+| Communications | Notifies stakeholders | PM | Engineering Manager |
+
+**Escalation:** If the primary contact does not respond within 5 minutes, escalate to the secondary contact.
+
+---
+
+## Quarterly Game-Day Drill Checklist
+
+Each quarter, run a full failover drill and check off each step.
+
+### Pre-Drill
+
+- [ ] Schedule the drill with stakeholders 2 weeks in advance.
+- [ ] Verify the drill environment is isolated from production traffic (use a staging or mirrored region pair).
+- [ ] Confirm the secondary region has a recent snapshot of production data.
+- [ ] Review the RTO/RPO targets: `RTO=15m`, `RPO=5m`.
+- [ ] Ensure all on-call contacts are available during the drill window.
+
+### Drill Execution
+
+1. **Simulate primary outage:**
+   - [ ] Stop the primary region application: `ssh primary-host "systemctl stop revora-backend"`
+   - [ ] Stop the primary database: `ssh primary-db "systemctl stop postgresql"`
+
+2. **Verify detection:**
+   - [ ] Confirm `/health` returns 503 for the primary region.
+   - [ ] Confirm the secondary replica lag is within acceptable bounds.
+
+3. **Execute failover:**
+   - [ ] Run DNS cut procedure (Section 4).
+   - [ ] Promote replica to primary (Section 5).
+   - [ ] Warm up idempotency store (Section 6).
+
+4. **Verify secondary:**
+   - [ ] `/health` returns 200 on `api.revora.io`.
+   - [ ] Write a test record to the promoted DB.
+   - [ ] Verify idempotency cache coverage > 99% of recent keys.
+
+5. **Measure:**
+   - [ ] Record total failover time (target: < 15 min).
+   - [ ] Record RPO (data loss window, target: < 5 min).
+
+6. **Rollback:**
+   - [ ] Restore primary region services.
+   - [ ] Reverse DNS cut.
+   - [ ] Re-establish replication.
+   - [ ] Verify full health on primary.
+
+### Post-Drill
+
+- [ ] Log the drill outcome to `docs/runbooks/drill-log-multi-region-failover.csv`.
+- [ ] File a post-mortem if any target was missed.
+- [ ] Update this runbook with any procedural improvements discovered.
+
+---
+
+## Drill Outcome Tracking
+
+Drill outcomes are logged to `docs/runbooks/drill-log-multi-region-failover.csv`.
+
+| Date | Tester | RTO (min) | RPO (min) | Pass/Fail | Notes |
+|------|--------|-----------|-----------|-----------|-------|
+
+Each row records the date, the engineer who drove the drill, the measured RTO and RPO, whether the drill passed both targets, and any notes or follow-up actions.
+
+---
+
+## Related Code
+
+| File | Purpose |
+|------|---------|
+| `src/db/client.ts` | Database connection pool and health check |
+| `src/routes/health.ts` | Health endpoints used for failover detection |
+| `src/config/env.ts` | Environment configuration including `DATABASE_URL` |
+| `src/middleware/idempotency.ts` | Idempotency middleware that keys off request hashes |
+| `scripts/drill-multi-region-failover.sh` | Automated drill script for failover verification |
diff --git a/scripts/drill-multi-region-failover.sh b/scripts/drill-multi-region-failover.sh
new file mode 100755
index 00000000..85360d30
--- /dev/null
+++ b/scripts/drill-multi-region-failover.sh
@@ -0,0 +1,217 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Multi-Region Failover Drill Script
+# Verifies DNS resolution, replica health, and idempotency-store readiness.
+# Designed to be run as part of a quarterly game-day drill.
+#
+# Usage:
+#   ./scripts/drill-multi-region-failover.sh [--check-replica|--warm-idempotency|--all|--help]
+#
+# ENV variables required:
+#   PRIMARY_DNS      - Primary region DNS name (default: api.revora.io)
+#   SECONDARY_DNS    - Secondary region DNS name (default: api-eu.revora.io)
+#   DATABASE_URL     - Primary database connection string
+#   DATABASE_URL_SECONDARY - Secondary database connection string
+#   REDIS_HOST       - Redis host for idempotency cache (optional)
+#   RTO_TARGET       - Recovery Time Objective in seconds (default: 900)
+#   RPO_TARGET       - Recovery Point Objective in seconds (default: 300)
+
+: "${PRIMARY_DNS:=api.revora.io}"
+: "${SECONDARY_DNS:=api-eu.revora.io}"
+: "${RTO_TARGET:=900}"
+: "${RPO_TARGET:=300}"
+
+START_EPOCH=$(date +%s)
+FAILURES=0
+
+log_pass()  { printf '[PASS] %s\n' "$1"; }
+log_fail()  { printf '[FAIL] %s\n' "$1"; FAILURES=$((FAILURES + 1)); }   # every FAIL line bumps the global counter
+log_info()  { printf '[INFO] %s\n' "$1"; }
+
+check_dns() {  # Resolve $1 through 8.8.8.8 and log PASS/FAIL under the label $2.
+  local dns_name=$1 label=$2 result   # result is local so it cannot leak into later checks
+
+  log_info "Resolving $label ($dns_name)..."
+  if ! result=$(dig +short "$dns_name" @8.8.8.8 2>/dev/null | head -1); then
+    log_fail "dig failed for $label"   # pipefail is on, so a non-zero dig exit lands here
+    return
+  fi
+  if [[ -n "$result" ]]; then
+    log_pass "$label resolves to $result"
+  else
+    log_fail "$label did not resolve to any address"
+  fi
+}
+
+check_health() {  # GET $1 (label $2) and log PASS only for an HTTP 200 response.
+  local url=$1 label=$2 http_code
+
+  log_info "Checking health at $label ($url)..."
+  # curl already prints 000 via -w when the transfer fails, so any non-zero exit is normalised to exactly 000.
+  http_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "$url" 2>/dev/null) || http_code="000"
+  if [[ "$http_code" == "200" ]]; then
+    log_pass "$label health endpoint returned 200"
+  elif [[ "$http_code" == "503" ]]; then
+    log_fail "$label health endpoint returned 503 (service degraded/unhealthy)"
+  else
+    log_fail "$label health endpoint returned HTTP $http_code (expected 200)"
+  fi
+}
+
+check_db_replica() {  # Check the secondary DB: connectivity, recovery state, and WAL receive-vs-replay backlog.
+  local is_in_recovery lag_bytes
+  log_info "Checking secondary replica health..."
+  if [[ -z "${DATABASE_URL_SECONDARY:-}" ]]; then
+    log_fail "DATABASE_URL_SECONDARY is not set — cannot check replica"
+    return
+  fi
+
+  if ! psql "$DATABASE_URL_SECONDARY" -c "SELECT 1" >/dev/null 2>&1; then
+    log_fail "Secondary database connection failed"
+    return
+  fi
+  log_pass "Secondary database accepts connections"
+
+  read -r is_in_recovery <<< "$(psql "$DATABASE_URL_SECONDARY" -At -c "SELECT pg_is_in_recovery();" 2>/dev/null || echo "unknown")"
+  if [[ "$is_in_recovery" == "t" ]]; then
+    log_pass "Secondary is in recovery (streaming replica mode)"
+  elif [[ "$is_in_recovery" == "f" ]]; then
+    log_info "Secondary is NOT in recovery (already promoted or standalone)"
+  else
+    log_fail "Could not determine secondary recovery state (got: $is_in_recovery)"
+  fi
+
+  lag_bytes=$(psql "$DATABASE_URL_SECONDARY" -At -c "
+    SELECT pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn());
+  " 2>/dev/null || echo "unknown")   # non-numeric sentinel: a failed query must not read as zero lag
+  if [[ ! "$lag_bytes" =~ ^-?[0-9]+$ ]]; then
+    log_fail "Could not measure secondary replica lag (got: ${lag_bytes:-empty})"
+  elif [[ "$lag_bytes" -le 104857600 ]]; then
+    log_pass "Secondary replica lag is ${lag_bytes} bytes (≤ 100 MB threshold)"
+  else
+    log_fail "Secondary replica lag is ${lag_bytes} bytes (exceeds 100 MB threshold)"
+  fi
+}
+
+warm_idempotency() {  # Seed the idempotency cache with webhook delivery keys from the last 24 hours.
+  log_info "Warming idempotency store..."
+  if [[ -z "${DATABASE_URL:-}" ]]; then
+    log_fail "DATABASE_URL is not set — cannot seed idempotency keys"
+    return
+  fi
+  if [[ -z "${REDIS_HOST:-}" ]]; then
+    log_info "REDIS_HOST not set — simulating warm-up via local cache file"
+    local cache_file="/tmp/idempotency-warmup-$(date +%s).txt"
+    psql "$DATABASE_URL" -At -c "
+      SELECT idempotency_key
+      FROM webhook_deliveries
+      WHERE created_at > NOW() - INTERVAL '24 hours'
+      ORDER BY created_at DESC;
+    " > "$cache_file" 2>/dev/null || touch "$cache_file"
+    local count
+    count=$(wc -l < "$cache_file")
+    if [[ "$count" -gt 0 ]]; then
+      log_pass "Seeded $count idempotency keys from the database"
+    else
+      log_info "No recent idempotency keys found (table may be empty or schema different)"
+    fi
+    rm -f "$cache_file"
+    return
+  fi
+
+  # Redis path: keys are fetched first so a psql failure is reported; the loop reads a here-string, not a pipe, so counters persist.
+  local keys key total_keys=0 failed_keys=0
+  if ! keys=$(psql "$DATABASE_URL" -At -c "
+    SELECT idempotency_key
+    FROM webhook_deliveries
+    WHERE created_at > NOW() - INTERVAL '24 hours'
+    ORDER BY created_at DESC;
+  " 2>/dev/null); then
+    log_fail "Could not read idempotency keys from the database"
+    return
+  fi
+  while IFS= read -r key; do
+    [[ -n "$key" ]] || continue
+    if redis-cli -h "$REDIS_HOST" SETEX "idempotency:${key}" 86400 "1" >/dev/null 2>&1; then total_keys=$((total_keys + 1)); else failed_keys=$((failed_keys + 1)); fi
+  done <<< "$keys"
+  log_pass "Warmed up $total_keys idempotency keys in Redis"
+  [[ "$failed_keys" -eq 0 ]] || log_fail "$failed_keys idempotency keys could not be written to Redis"
+}
+
+measure_rto() {  # Compare seconds elapsed since START_EPOCH against RTO_TARGET and log the verdict.
+  local now spent
+  now=$(date +%s); spent=$((now - START_EPOCH))
+  log_info "Elapsed time: ${spent}s (RTO target: ${RTO_TARGET}s)"
+  if [[ "$spent" -le "$RTO_TARGET" ]]; then
+    log_pass "RTO of ${spent}s is within target (${RTO_TARGET}s)"
+  else
+    log_fail "RTO of ${spent}s exceeds target (${RTO_TARGET}s)"
+  fi
+}
+
+run_all() {  # Run every check in order, print a summary, and exit with the number of failed checks.
+  log_info "=== Multi-Region Failover Drill ==="
+  log_info "Start time: $(date -u)"
+  log_info ""
+
+  check_dns "$PRIMARY_DNS" "Primary DNS"
+  check_dns "$SECONDARY_DNS" "Secondary DNS"
+  echo ""
+
+  check_health "https://${PRIMARY_DNS}/health" "Primary"
+  check_health "https://${SECONDARY_DNS}/health" "Secondary"
+  echo ""
+
+  check_db_replica
+  echo ""
+
+  warm_idempotency
+  echo ""
+
+  measure_rto
+
+  echo ""
+  if [[ "$FAILURES" -eq 0 ]]; then
+    log_pass "All checks passed."
+  else
+    echo "[FAIL] $FAILURES check(s) failed. Review output above."   # plain echo: the summary is not itself a failed check
+  fi
+  echo "=== Drill Complete ==="
+  exit "$(( FAILURES > 255 ? 255 : FAILURES ))"   # exit statuses wrap modulo 256; clamp so a large count never reads as 0
+}
+
+show_help() {  # Print the usage text below to stdout, then terminate the script with status 0.
+  cat <<EOF
+Multi-Region Failover Drill Script
+
+Usage: $0 [OPTION]
+
+Options:
+  --check-replica      Verify secondary replica health (DB connection, lag, recovery state)
+  --warm-idempotency   Seed the idempotency cache from recent webhook_deliveries
+  --all                Run all checks in sequence (default)
+  --help               Display this help and exit
+
+Environment:
+  PRIMARY_DNS              Primary region DNS (default: api.revora.io)
+  SECONDARY_DNS            Secondary region DNS (default: api-eu.revora.io)
+  DATABASE_URL             Primary DB connection string
+  DATABASE_URL_SECONDARY   Secondary DB connection string
+  REDIS_HOST               Redis host for idempotency cache
+  RTO_TARGET               RTO target in seconds (default: 900)
+  RPO_TARGET               RPO target in seconds (default: 300)
+EOF
+  exit 0  # exits the whole script, so nothing after a show_help call runs in the same shell
+}
+
+case "${1:---all}" in   # no argument behaves like --all
+  --check-replica)    check_db_replica; exit $(( FAILURES > 0 )) ;;   # exit 1 on any failed check so callers can gate on it
+  --warm-idempotency) warm_idempotency; exit $(( FAILURES > 0 )) ;;
+  --all)              run_all ;;
+  --help|-h)          show_help ;;
+  *)
+    echo "Unknown option: $1" >&2
+    (show_help) >&2; exit 2   # show_help exits 0 itself, so run it in a subshell and fail explicitly
+    ;;
+esac
diff --git a/src/config/env.ts b/src/config/env.ts
index a982d0d1..bb79a306 100644
--- a/src/config/env.ts
+++ b/src/config/env.ts
@@ -32,6 +32,8 @@ import { z } from "zod";
  * | SMTP_PORT                   | SMTP     | 587                     | SMTP relay port                                  |
  * | SMTP_USER                   | No       | (empty)                 | SMTP username; sent only after STARTTLS          |
  * | SMTP_PASS                   | No       | (empty)                 | SMTP password; sent only after STARTTLS          |
+ * | REGION                      | No       | us-east-1               | Current region identifier for multi-region setup |
+ * | FAILOVER_ACTIVE_REGION      | No       | (REGION value)          | Region currently serving as active failover      |
  */
 
 const envSchema = z.object({
@@ -56,6 +58,8 @@ const envSchema = z.object({
   AUDIT_RETENTION_DAYS: z.coerce.number().int().positive().default(90),
   EMAIL_PROVIDER: z.enum(["sendgrid", "smtp", "mock"]).optional(),
   FROM_EMAIL: z.string().email().optional(),
+  REGION: z.string().default("us-east-1"),
+  FAILOVER_ACTIVE_REGION: z.string().optional(),
   SENDGRID_API_KEY: z.string().optional(),
   SMTP_HOST: z.string().optional(),
   SMTP_PORT: z.coerce.number().int().positive().max(65535).optional(),
diff --git a/src/index.ts b/src/index.ts
index 085edcdc..9f06ab1d 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -572,7 +572,21 @@ export function createApp(dependencies: AppDependencies = {}): express.Express {
     });
   });
 
-  app.use("/health", createHealthRouter(healthQuery as any, healthStatus));
+  app.get("/health/failover", async (_req: Request, res: Response) => {
+    const region = env.REGION;
+    const activeRegion = env.FAILOVER_ACTIVE_REGION ?? region;
+    const db = await healthStatus().catch(() => ({ healthy: false })); // a rejected probe is reported as down (503) instead of escaping the handler
+    res.status(db.healthy ? 200 : 503).json({
+      region,
+      activeRegion,
+      isActive: region === activeRegion,
+      db: db.healthy ? "up" : "down",
+      failoverActive: region !== activeRegion,
+      timestamp: new Date().toISOString(),
+    });
+  });
+
+  app.use("/health", createHealthRouter(healthQuery as any, healthStatus, undefined, env.REGION));
 
   apiRouter.get("/overview", (_req: Request, res: Response) => {
     res.json({
diff --git a/src/routes/health.test.ts b/src/routes/health.test.ts
index 60f867fc..c0230fb5 100644
--- a/src/routes/health.test.ts
+++ b/src/routes/health.test.ts
@@ -16,6 +16,7 @@ import {
   createHealthRouter,
   healthLiveHandler,
   healthReadyHandler,
+  healthRegionHandler,
   healthRootHandler,
   healthStartupHandler,
   mapHealthDependencyFailure,
@@ -1265,6 +1266,112 @@ describe("healthRootHandler - dependency graph aggregation", () => {
     expect(dbCheck.dependsOn).toContain("db-pool");
   });
 
+  it("returns region info from healthRegionHandler", async () => {
+    // Assumes FAILOVER_ACTIVE_REGION is unset here, so the handler's own region is also the active one.
+    const server = express();
+    server.get("/region", healthRegionHandler("eu-west-1"));
+    const { status, body } = await request(server).get("/region");
+
+    expect(status).toBe(200);
+    expect(body.region).toBe("eu-west-1");
+    expect(body.activeRegion).toBe("eu-west-1");
+    expect(body.isActive).toBe(true);
+    expect(body.service).toBe("revora-backend");
+    expect(body.timestamp).toBeDefined();
+  });
+
+  it("healthRegionHandler defaults to us-east-1 when no region provided", async () => {
+    // No argument is passed; presumably REGION is also unset in the test env, so the built-in default applies.
+    const server = express();
+    server.get("/region", healthRegionHandler());
+    const { body } = await request(server).get("/region");
+
+    expect(body.region).toBe("us-east-1");
+    expect(body.isActive).toBe(true);
+  });
+
+  it("healthRegionHandler reports inactive when region mismatch", async () => {
+    process.env.FAILOVER_ACTIVE_REGION = "eu-west-1";
+    try {
+      const app = express();
+      app.get("/region", healthRegionHandler("us-east-1"));
+      const response = await request(app).get("/region");
+      expect(response.body.region).toBe("us-east-1");
+      expect(response.body.activeRegion).toBe("eu-west-1");
+      expect(response.body.isActive).toBe(false);
+    } finally {
+      delete process.env.FAILOVER_ACTIVE_REGION; // runs even when an assertion throws, so the override cannot leak into later tests
+    }
+  });
+
+  it("failover endpoint returns failover status from createApp", async () => {
+    const savedEnv = { ...process.env }; // snapshot so pre-existing REGION / FAILOVER_ACTIVE_REGION values are restored, not deleted
+    process.env.REGION = "eu-west-1";
+    process.env.FAILOVER_ACTIVE_REGION = "eu-west-1";
+    try {
+      const app = createApp({
+        healthStatus: jest.fn().mockResolvedValue({
+          healthy: true,
+          latencyMs: 5,
+          pool: { totalCount: 2, idleCount: 2, waitingCount: 0, maxConnections: 10 },
+        }),
+        healthQuery: jest.fn(),
+      });
+      const response = await request(app).get("/health/failover");
+      expect(response.status).toBe(200);
+      expect(response.body.region).toBe("eu-west-1");
+      expect(response.body.activeRegion).toBe("eu-west-1");
+      expect(response.body.isActive).toBe(true);
+      expect(response.body.failoverActive).toBe(false);
+      expect(response.body.db).toBe("up");
+    } finally {
+      process.env = savedEnv; // runs even when an assertion throws
+    }
+  });
+
+  it("failover endpoint reports failoverActive=true when region mismatch", async () => {
+    const savedEnv = { ...process.env }; // snapshot of the environment as it was before this test's overrides
+    process.env.REGION = "us-east-1";
+    process.env.FAILOVER_ACTIVE_REGION = "eu-west-1";
+    try {
+      const app = createApp({
+        healthStatus: jest.fn().mockResolvedValue({
+          healthy: true,
+          latencyMs: 5,
+          pool: { totalCount: 2, idleCount: 2, waitingCount: 0, maxConnections: 10 },
+        }),
+        healthQuery: jest.fn(),
+      });
+
+      const response = await request(app).get("/health/failover");
+
+      expect(response.status).toBe(200);
+      expect(response.body.region).toBe("us-east-1");
+      expect(response.body.activeRegion).toBe("eu-west-1");
+      expect(response.body.isActive).toBe(false);
+      expect(response.body.failoverActive).toBe(true);
+    } finally {
+      process.env = savedEnv; // runs even when an assertion throws, so overrides cannot leak into later tests
+    }
+  });
+
+  it("failover endpoint returns 503 when db is down", async () => {
+    // The probe resolves (does not reject) with healthy=false; the route is expected to answer 503 with db "down".
+    const unhealthyProbe = jest.fn().mockResolvedValue({
+      healthy: false,
+      latencyMs: 100,
+      error: "connection refused",
+      pool: { totalCount: 0, idleCount: 0, waitingCount: 0, maxConnections: 10 },
+    });
+
+    const app = createApp({ healthStatus: unhealthyProbe, healthQuery: jest.fn() });
+
+    const { status, body } = await request(app).get("/health/failover");
+
+    expect(status).toBe(503);
+    expect(body.db).toBe("down");
+  });
+
   it("returns 503 unhealthy when both DB and Horizon are down", async () => {
     const mockDbHealth = jest.fn().mockResolvedValue({
       healthy: false,
diff --git a/src/routes/health.ts b/src/routes/health.ts
index ed6ab793..c6c5a81a 100644
--- a/src/routes/health.ts
+++ b/src/routes/health.ts
@@ -609,16 +609,32 @@ export const healthReadyHandler =
     }
   };
 
+/** Handler factory for the region endpoint: always answers 200 with this instance's region and the active region. */
+export const healthRegionHandler =
+  (region?: string) =>
+  async (_req: Request, res: Response): Promise<void> => {
+    // process.env is consulted on every request, not once when the handler is built.
+    const ownRegion = region ?? process.env.REGION ?? "us-east-1";
+    const activeRegion = process.env.FAILOVER_ACTIVE_REGION ?? ownRegion;
+    const isActive = ownRegion === activeRegion;
+    const timestamp = new Date().toISOString();
+    res
+      .status(200)
+      .json({ region: ownRegion, activeRegion, isActive, service: "revora-backend", timestamp });
+  };
+
 export const createHealthRouter = (
   db: QueryableDb,
   dbHealth: DbHealthChecker,
   metrics?: MetricsCollector,
+  region?: string, // optional region identifier, forwarded only to the /region handler below
 ): Router => {
   const router = Router();
   router.get("/", healthRootHandler(dbHealth));
   router.get("/live", healthLiveHandler());
   router.get("/ready", healthReadyHandler(db, metrics));
   router.get("/startup", healthStartupHandler(dbHealth));
+  router.get("/region", healthRegionHandler(region)); // answers with this instance's region and the currently active region
   return router;
 };