diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 76acf952..fa261da4 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -128,6 +128,34 @@ jobs:
           retention-days: 7
           if-no-files-found: ignore
 
+  # ---------- Orca Integration Tests ----------
+  # Spins up LocalStack and Azurite via testcontainers-go and runs the
+  # orca in-process integration suite (internal/orca/inttest). Docker
+  # is preinstalled on GitHub-hosted Ubuntu runners; no extra services:
+  # block is required.
+  orca-inttest:
+    name: Orca Integration Tests
+    needs: [frontend]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Download frontend dist
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
+        with:
+          name: frontend-dist
+          path: internal/net/html/dist
+
+      - name: Set up Go
+        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        with:
+          go-version-file: go.mod
+          cache-dependency-path: go.sum
+
+      - name: Run orca-inttest
+        run: make orca-inttest
+
   # ---------- Build ----------
   build:
     name: Build
diff --git a/Makefile b/Makefile
index 5be64f18..1c0134c8 100644
--- a/Makefile
+++ b/Makefile
@@ -74,6 +74,14 @@ STAMP_LDFLAGS=-X github.com/Azure/unbounded/internal/version.Version=$(VERSION)
 
 METALMAN_IMAGE=$(CONTAINER_REGISTRY)/metalman:$(VERSION)
 
+# Orca configuration
+ORCA_BIN=bin/orca
+ORCA_CMD=./cmd/orca
+ORCA_IMAGE ?= $(CONTAINER_REGISTRY)/orca:$(VERSION)
+ORCA_NAMESPACE ?= unbounded-kube
+ORCA_MANIFEST_TEMPLATES_DIR := deploy/orca
+ORCA_MANIFEST_RENDERED_DIR  := deploy/orca/rendered
+
 # kubectl-unbounded also stamps the metalman image reference.
 KUBECTL_UNBOUNDED_LDFLAGS=$(STAMP_LDFLAGS) -X github.com/Azure/unbounded/cmd/kubectl-unbounded/app.MetalmanImage=$(METALMAN_IMAGE)
 
@@ -112,6 +120,7 @@ REACT_DEV ?= false
 .PHONY: all help fmt lint test build vulncheck check-deps kubectl-unbounded kubectl-unbounded-build install-tools install-protoc generate kubectl-unbounded forge unbounded-agent machina machina-build machina-oci machina-oci-push machina-manifests machine-ops-controller machine-ops-controller-build machine-ops-controller-oci machine-ops-controller-oci-push machine-ops-manifests metalman metalman-build metalman-oci metalman-oci-push gomod docs-serve unbounded-net-controller unbounded-net-node unbounded-net-routeplan-debug unping unroute notice notice-check
 .PHONY: net-frontend net-frontend-clean net-build-ebpf net-manifests release-manifests
 .PHONY: image-machina-local image-machine-ops-controller-local image-metalman-local image-net-controller-local image-net-node-local images-local
+.PHONY: orca orca-build orca-manifests orca-oci orca-oci-push orca-up orca-down orca-reset orca-inttest image-orca-local
 
 ##@ General
 
@@ -176,6 +185,8 @@ help: ## Show this help
 	@echo "  machina-oci-push                 Build machina image and push"
 	@echo "  machine-ops-controller-oci-push  Build machine-ops-controller image and push"
 	@echo "  metalman-oci-push                Build metalman image and push"
+	@echo "  image-orca-local                 Build orca image"
+	@echo "  orca-oci-push                    Build orca image and push"
 	@echo ""
 	@echo "Net Frontend:"
 	@echo "  net-frontend                     Build frontend into \$$(NET_FRONTEND_DIST_DIR) (cached)"
@@ -188,10 +199,19 @@ help: ## Show this help
 	@echo "  machina-manifests                Render machina manifests into deploy/machina/rendered"
 	@echo "  machine-ops-manifests            Render machine-ops manifests into deploy/machine-ops/rendered"
 	@echo "  net-manifests                    Render net manifests into \$$(NET_MANIFEST_RENDERED_DIR)"
+	@echo "  orca-manifests                   Render orca manifests into deploy/orca/rendered"
 	@echo ""
 	@echo "Net Kubernetes (apply to current kubectl context):"
 	@echo "  See \`make -C hack/net help\` for cluster deploy/undeploy targets."
 	@echo ""
+	@echo "Orca Dev Harness (Kind cluster):"
+	@echo "  orca | orca-build                Build orca binary (with/without lint/test)"
+	@echo "  orca-up                          Bring up Orca dev harness in Kind"
+	@echo "  orca-down                        Tear down Orca dev harness Kind cluster"
+	@echo "  orca-reset                       Rebuild image and rollout-restart deployment"
+	@echo "  orca-inttest                     Run orca integration tests (Docker required)"
+	@echo "  See \`make -C hack/orca help\` for full list."
+	@echo ""
 	@echo "Documentation:"
 	@echo "  docs-serve                       Start local Hugo dev server"
 	@echo ""
@@ -570,6 +590,58 @@ metalman-oci: image-metalman-local ## Alias for image-metalman-local
 metalman-oci-push: metalman-oci ## Build and push the metalman container image
 	$(CONTAINER_ENGINE) push $(METALMAN_IMAGE)
 
+##@ Orca
+
+orca-build: ## Build the orca binary (no lint/test)
+	$(GOBUILD) -ldflags '$(STAMP_LDFLAGS)' -o $(ORCA_BIN) $(ORCA_CMD)/main.go
+
+orca: test orca-build ## Build the orca binary (implies test)
+
+orca-manifests: ## Render orca deployment manifests into deploy/orca/rendered
+	@mkdir -p $(ORCA_MANIFEST_RENDERED_DIR)
+	@find $(ORCA_MANIFEST_RENDERED_DIR) -mindepth 1 -not -name .gitignore -delete 2>/dev/null || true
+	$(GOCMD) run ./hack/cmd/render-manifests \
+		--templates-dir $(ORCA_MANIFEST_TEMPLATES_DIR) \
+		--output-dir $(ORCA_MANIFEST_RENDERED_DIR) \
+		--set Namespace=$(ORCA_NAMESPACE) \
+		--set Image=$(ORCA_IMAGE)
+	@echo "Rendered orca manifests into $(ORCA_MANIFEST_RENDERED_DIR) (image: $(ORCA_IMAGE))"
+
+image-orca-local: ## Build the orca container image locally (single-arch)
+	$(CONTAINER_ENGINE) build \
+		--build-arg VERSION=$(VERSION) \
+		--build-arg GIT_COMMIT=$(GIT_COMMIT) \
+		--build-arg BUILD_TIME=$(BUILD_TIME) \
+		-t orca:$(VERSION) -t $(ORCA_IMAGE) \
+		-f ./images/orca/Containerfile .
+
+orca-oci: image-orca-local ## Alias for image-orca-local
+
+orca-oci-push: orca-oci ## Build and push the orca container image
+	$(CONTAINER_ENGINE) push $(ORCA_IMAGE)
+
+# Dev-cluster proxy targets. The actual implementations live in
+# hack/orca/Makefile (see AGENTS.md convention; mirrors hack/net/).
+orca-up: ## Bring up the Orca dev harness in a Kind cluster
+	$(MAKE) -C hack/orca up
+
+orca-down: ## Tear down the Orca dev harness Kind cluster
+	$(MAKE) -C hack/orca down
+
+orca-reset: ## Rebuild orca image and rolling-restart the dev deployment
+	$(MAKE) -C hack/orca reset
+
+# orca-inttest mirrors the test/test-race pattern: race detector in CI
+# (ubuntu-latest has gcc), no -race locally so developers without a C
+# toolchain can still run integration tests.
+ifdef CI
+orca-inttest: ## Run orca integration tests (LocalStack + Azurite via testcontainers; requires Docker)
+	$(GOTEST) -tags=integrationtest -race -timeout 15m ./internal/orca/inttest/...
+else
+orca-inttest: ## Run orca integration tests (LocalStack + Azurite via testcontainers; requires Docker)
+	$(GOTEST) -tags=integrationtest -timeout 15m ./internal/orca/inttest/...
+endif
+
 image-net-controller-local: net-frontend resources/cni-plugins-linux-$(HOST_GOARCH)-$(CNI_PLUGINS_VERSION).tgz ## Build the unbounded-net-controller image locally (single-arch)
 	$(CONTAINER_ENGINE) build \
 		--target controller \
diff --git a/cmd/orca/main.go b/cmd/orca/main.go
new file mode 100644
index 00000000..f7ea8484
--- /dev/null
+++ b/cmd/orca/main.go
@@ -0,0 +1,10 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package main
+
+import "github.com/Azure/unbounded/cmd/orca/orca"
+
+func main() {
+	orca.Run()
+}
diff --git a/cmd/orca/orca/orca.go b/cmd/orca/orca/orca.go
new file mode 100644
index 00000000..48ac19ae
--- /dev/null
+++ b/cmd/orca/orca/orca.go
@@ -0,0 +1,134 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Package orca wires the Orca cache binary together. It is invoked by
+// cmd/orca/main.go and is responsible for parsing flags, loading the
+// YAML config, and delegating to internal/orca/app for actual runtime
+// wiring.
+package orca
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"os"
+	"os/signal"
+	"strings"
+	"syscall"
+	"time"
+
+	"github.com/spf13/cobra"
+
+	"github.com/Azure/unbounded/internal/orca/app"
+	"github.com/Azure/unbounded/internal/orca/config"
+)
+
+// Run is the entrypoint invoked by cmd/orca/main.go.
+func Run() {
+	root := &cobra.Command{
+		Use:   "orca",
+		Short: "Orca origin cache - S3-compatible read-only cache fronting Azure / S3 origins",
+	}
+	root.AddCommand(newServeCmd())
+
+	if err := root.Execute(); err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(1)
+	}
+}
+
+func newServeCmd() *cobra.Command {
+	var configPath string
+
+	cmd := &cobra.Command{
+		Use:   "serve",
+		Short: "Run the Orca cache server",
+		RunE: func(cmd *cobra.Command, _ []string) error {
+			return serve(cmd.Context(), configPath)
+		},
+	}
+	cmd.Flags().StringVarP(&configPath, "config", "c", "/etc/orca/config.yaml",
+		"path to YAML config file")
+
+	return cmd
+}
+
+func serve(parent context.Context, configPath string) error {
+	cfg, err := config.Load(configPath)
+	if err != nil {
+		return fmt.Errorf("load config: %w", err)
+	}
+
+	level, err := resolveLogLevel(cfg.Logging.Level)
+	if err != nil {
+		return err
+	}
+
+	levelVar := new(slog.LevelVar)
+	levelVar.Set(level)
+
+	log := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
+		Level:     levelVar,
+		AddSource: true,
+	}))
+	slog.SetDefault(log)
+
+	log.Info("orca starting",
+		"config_path", configPath,
+		"log_level", level.String(),
+	)
+
+	log.Info("config loaded",
+		"origin_id", cfg.Origin.ID,
+		"replicas_target", cfg.Cluster.TargetReplicas,
+		"target_global", cfg.Origin.TargetGlobal,
+		"internal_tls", cfg.Cluster.InternalTLS.Enabled,
+		"client_auth", cfg.Server.Auth.Enabled,
+	)
+
+	ctx, cancel := signal.NotifyContext(parent, os.Interrupt, syscall.SIGTERM)
+	defer cancel()
+
+	a, err := app.Start(ctx, cfg, app.WithLogger(log))
+	if err != nil {
+		return err
+	}
+
+	if waitErr := a.Wait(ctx); waitErr != nil {
+		log.Error("listener exited with error", "err", waitErr)
+		cancel()
+	} else {
+		log.Info("shutdown signal received")
+	}
+
+	shutdownCtx, shCancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer shCancel()
+
+	// Propagate Shutdown errors to the process exit code so that
+	// failed-shutdown signals (kubelet probes, init systems) match
+	// reality. App.Shutdown also logs each individual error
+	// internally, so this only governs the exit-code semantics.
+	shutdownErr := a.Shutdown(shutdownCtx)
+
+	log.Info("orca stopped")
+
+	return shutdownErr
+}
+
+// resolveLogLevel determines the effective slog.Level by consulting
+// the ORCA_LOG_LEVEL environment variable first; if unset or empty,
+// falls back to the YAML-configured value. An unrecognised value
+// (from either source) returns a parse error so misconfiguration is
+// surfaced at startup rather than silently degrading to info.
+func resolveLogLevel(yamlLevel string) (slog.Level, error) {
+	if env := strings.TrimSpace(os.Getenv("ORCA_LOG_LEVEL")); env != "" {
+		level, err := config.ParseLogLevel(env)
+		if err != nil {
+			return 0, fmt.Errorf("ORCA_LOG_LEVEL: %w", err)
+		}
+
+		return level, nil
+	}
+
+	return config.ParseLogLevel(yamlLevel)
+}
diff --git a/cmd/orca/orca/orca_test.go b/cmd/orca/orca/orca_test.go
new file mode 100644
index 00000000..ca3c3352
--- /dev/null
+++ b/cmd/orca/orca/orca_test.go
@@ -0,0 +1,58 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package orca
+
+import (
+	"log/slog"
+	"testing"
+)
+
+// TestResolveLogLevel_PrecedenceAndDefault covers the resolution
+// order documented on resolveLogLevel: ORCA_LOG_LEVEL wins when
+// set and non-empty (after trim), otherwise the YAML-configured
+// value is used, otherwise the empty string defaults through
+// config.ParseLogLevel to info.
+func TestResolveLogLevel_PrecedenceAndDefault(t *testing.T) {
+	tests := []struct {
+		name      string
+		yamlLevel string
+		envLevel  string // "" -> simulate unset via Setenv with ""
+		want      slog.Level
+		wantErr   bool
+	}{
+		{"empty yaml, no env -> info", "", "", slog.LevelInfo, false},
+		{"yaml info, no env", "info", "", slog.LevelInfo, false},
+		{"yaml debug, no env", "debug", "", slog.LevelDebug, false},
+		{"yaml info overridden by env debug", "info", "debug", slog.LevelDebug, false},
+		{"yaml debug overridden by env warn", "debug", "warn", slog.LevelWarn, false},
+		{"whitespace env falls back to yaml", "warn", "   ", slog.LevelWarn, false},
+		{"invalid yaml fails", "trace", "", 0, true},
+		{"invalid env fails even when yaml valid", "info", "trace", 0, true},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Setenv("ORCA_LOG_LEVEL", tt.envLevel)
+
+			got, err := resolveLogLevel(tt.yamlLevel)
+			if tt.wantErr {
+				if err == nil {
+					t.Errorf("resolveLogLevel(%q) = %v, want error", tt.yamlLevel, got)
+				}
+
+				return
+			}
+
+			if err != nil {
+				t.Errorf("resolveLogLevel(%q) unexpected err: %v", tt.yamlLevel, err)
+				return
+			}
+
+			if got != tt.want {
+				t.Errorf("resolveLogLevel(yaml=%q, env=%q) = %v, want %v",
+					tt.yamlLevel, tt.envLevel, got, tt.want)
+			}
+		})
+	}
+}
diff --git a/deploy/orca/01-namespace.yaml.tmpl b/deploy/orca/01-namespace.yaml.tmpl
new file mode 100644
index 00000000..fd353a35
--- /dev/null
+++ b/deploy/orca/01-namespace.yaml.tmpl
@@ -0,0 +1,6 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: {{ default "unbounded-kube" .Namespace }}
+  labels:
+    app.kubernetes.io/name: orca
diff --git a/deploy/orca/02-rbac.yaml.tmpl b/deploy/orca/02-rbac.yaml.tmpl
new file mode 100644
index 00000000..5961196b
--- /dev/null
+++ b/deploy/orca/02-rbac.yaml.tmpl
@@ -0,0 +1,8 @@
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: orca
+  namespace: {{ default "unbounded-kube" .Namespace }}
+  labels:
+    app.kubernetes.io/name: orca
diff --git a/deploy/orca/03-config.yaml.tmpl b/deploy/orca/03-config.yaml.tmpl
new file mode 100644
index 00000000..26ac7f82
--- /dev/null
+++ b/deploy/orca/03-config.yaml.tmpl
@@ -0,0 +1,74 @@
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: orca-config
+  namespace: {{ default "unbounded-kube" .Namespace }}
+  labels:
+    app.kubernetes.io/name: orca
+data:
+  config.yaml: |
+    # Orca origin cache configuration.
+    # Secret values (account keys, S3 access/secret) are sourced from
+    # environment variables ORCA_AZUREBLOB_ACCOUNT_KEY,
+    # ORCA_CACHESTORE_S3_ACCESS_KEY, ORCA_CACHESTORE_S3_SECRET_KEY,
+    # populated by the orca-credentials Secret via envFrom.
+
+    server:
+      listen: "0.0.0.0:8443"
+      auth:
+        # Dev: disabled. Production: enable bearer or mtls.
+        enabled: {{ default "false" .ServerAuthEnabled }}
+
+    origin:
+      id: {{ default "azureblob-default" .OriginID | quote }}
+      driver: {{ default "azureblob" .OriginDriver }}
+      target_global: {{ default "192" .TargetGlobal }}
+      queue_timeout: 5s
+      retry:
+        attempts: 3
+        backoff_initial: 100ms
+        backoff_max: 2s
+        max_total_duration: 5s
+      azureblob:
+        account: {{ default "" .AzureAccount | quote }}
+        container: {{ default "" .AzureContainer | quote }}
+        endpoint: {{ default "" .AzureEndpoint | quote }}
+      awss3:
+        endpoint: {{ default "" .OriginAWSS3Endpoint | quote }}
+        region: {{ default "us-east-1" .OriginAWSS3Region | quote }}
+        bucket: {{ default "" .OriginAWSS3Bucket | quote }}
+        use_path_style: {{ default "false" .OriginAWSS3UsePathStyle }}
+
+    cachestore:
+      driver: s3
+      s3:
+        endpoint: {{ default "http://localstack.unbounded-kube.svc.cluster.local:4566" .CachestoreEndpoint | quote }}
+        bucket: {{ default "orca-cache" .CachestoreBucket | quote }}
+        region: {{ default "us-east-1" .CachestoreRegion | quote }}
+        use_path_style: true
+
+    cluster:
+      service: {{ default "orca-peers.unbounded-kube.svc.cluster.local" .ClusterService | quote }}
+      membership_refresh: 5s
+      internal_listen: "0.0.0.0:8444"
+      target_replicas: {{ default "3" .TargetReplicas }}
+      internal_tls:
+        # Dev: disabled (plain HTTP/2 between peers). Production: true.
+        enabled: {{ default "false" .InternalTLSEnabled }}
+
+    chunk_catalog:
+      max_entries: 100000
+
+    metadata:
+      ttl: 5m
+      negative_ttl: 60s
+      max_entries: 10000
+
+    chunking:
+      size: 8388608
+
+    logging:
+      # One of debug, info, warn, error. Overridden at runtime by the
+      # ORCA_LOG_LEVEL environment variable when set.
+      level: {{ default "info" .LogLevel | quote }}
diff --git a/deploy/orca/04-deployment.yaml.tmpl b/deploy/orca/04-deployment.yaml.tmpl
new file mode 100644
index 00000000..d2f11397
--- /dev/null
+++ b/deploy/orca/04-deployment.yaml.tmpl
@@ -0,0 +1,91 @@
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: orca
+  namespace: {{ default "unbounded-kube" .Namespace }}
+  labels:
+    app.kubernetes.io/name: orca
+spec:
+  replicas: {{ default "3" .TargetReplicas }}
+  # Required pod-anti-affinity below pins one Orca pod per node.
+  # In the dev harness the worker count == replica count, so default
+  # RollingUpdate can't surge: the new pod has no node to land on.
+  # maxSurge=0 / maxUnavailable=1 walks the replicas one-at-a-time.
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxSurge: 0
+      maxUnavailable: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: orca
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: orca
+    spec:
+      serviceAccountName: orca
+      # Required anti-affinity: at most one Orca pod per node so that a
+      # single node failure does not knock out multiple replicas. The
+      # dev harness Kind cluster has 3 worker nodes to match the default
+      # 3 replicas.
+      affinity:
+        podAntiAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            - labelSelector:
+                matchLabels:
+                  app.kubernetes.io/name: orca
+              topologyKey: kubernetes.io/hostname
+      containers:
+        - name: orca
+          image: {{ default "ghcr.io/azure/orca:latest" .Image | quote }}
+          imagePullPolicy: {{ default "IfNotPresent" .ImagePullPolicy }}
+          args:
+            - serve
+            - --config=/etc/orca/config.yaml
+          env:
+            - name: POD_IP
+              valueFrom:
+                fieldRef:
+                  fieldPath: status.podIP
+          envFrom:
+            - secretRef:
+                name: orca-credentials
+          ports:
+            - containerPort: 8443
+              name: edge
+              protocol: TCP
+            - containerPort: 8444
+              name: internal
+              protocol: TCP
+            - containerPort: 8442
+              name: ops
+              protocol: TCP
+          livenessProbe:
+            httpGet:
+              path: /healthz
+              port: ops
+            initialDelaySeconds: 5
+            periodSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /readyz
+              port: ops
+            initialDelaySeconds: 2
+            periodSeconds: 5
+          resources:
+            requests:
+              cpu: {{ default "200m" .ResourceCPURequest }}
+              memory: {{ default "256Mi" .ResourceMemoryRequest }}
+            limits:
+              cpu: {{ default "2" .ResourceCPULimit }}
+              memory: {{ default "1Gi" .ResourceMemoryLimit }}
+          volumeMounts:
+            - name: config
+              mountPath: /etc/orca
+              readOnly: true
+      volumes:
+        - name: config
+          configMap:
+            name: orca-config
diff --git a/deploy/orca/05-service.yaml.tmpl b/deploy/orca/05-service.yaml.tmpl
new file mode 100644
index 00000000..36dba4fd
--- /dev/null
+++ b/deploy/orca/05-service.yaml.tmpl
@@ -0,0 +1,43 @@
+---
+# Client-facing Service: standard ClusterIP. Clients of the cache (e.g.
+# tools speaking S3 to fetch objects) connect here. Kube-proxy load
+# balances across the 3 replicas.
+apiVersion: v1
+kind: Service
+metadata:
+  name: orca
+  namespace: {{ default "unbounded-kube" .Namespace }}
+  labels:
+    app.kubernetes.io/name: orca
+spec:
+  type: ClusterIP
+  selector:
+    app.kubernetes.io/name: orca
+  ports:
+    - name: edge
+      port: 8443
+      targetPort: edge
+      protocol: TCP
+
+---
+# Peer-discovery Service: headless (ClusterIP: None). LookupHost on
+# orca-peers.<ns>.svc.cluster.local returns all pod IPs, enabling
+# rendezvous-hash coordination among Orca replicas.
+apiVersion: v1
+kind: Service
+metadata:
+  name: orca-peers
+  namespace: {{ default "unbounded-kube" .Namespace }}
+  labels:
+    app.kubernetes.io/name: orca
+spec:
+  type: ClusterIP
+  clusterIP: None
+  publishNotReadyAddresses: true
+  selector:
+    app.kubernetes.io/name: orca
+  ports:
+    - name: internal
+      port: 8444
+      targetPort: internal
+      protocol: TCP
diff --git a/deploy/orca/dev/01-localstack.yaml.tmpl b/deploy/orca/dev/01-localstack.yaml.tmpl
new file mode 100644
index 00000000..87dfcc02
--- /dev/null
+++ b/deploy/orca/dev/01-localstack.yaml.tmpl
@@ -0,0 +1,83 @@
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: localstack
+  namespace: {{ default "unbounded-kube" .Namespace }}
+  labels:
+    app.kubernetes.io/name: localstack
+    app.kubernetes.io/part-of: orca-dev
+spec:
+  type: ClusterIP
+  selector:
+    app.kubernetes.io/name: localstack
+  ports:
+    - name: edge
+      port: 4566
+      targetPort: 4566
+      protocol: TCP
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: localstack
+  namespace: {{ default "unbounded-kube" .Namespace }}
+  labels:
+    app.kubernetes.io/name: localstack
+    app.kubernetes.io/part-of: orca-dev
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: localstack
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: localstack
+        app.kubernetes.io/part-of: orca-dev
+    spec:
+      containers:
+        - name: localstack
+          # 3.8 is community-tier; 'latest' became Pro-only and exits
+          # with code 55 ("License activation failed").
+          image: {{ default "localstack/localstack:3.8" .LocalstackImage | quote }}
+          imagePullPolicy: IfNotPresent
+          ports:
+            - containerPort: 4566
+              name: edge
+              protocol: TCP
+          env:
+            - name: SERVICES
+              value: s3
+            - name: DEBUG
+              value: "0"
+            - name: PERSISTENCE
+              value: "0"
+          resources:
+            requests:
+              cpu: 100m
+              memory: 256Mi
+            limits:
+              cpu: 1
+              memory: 1Gi
+          readinessProbe:
+            httpGet:
+              path: /_localstack/health
+              port: 4566
+            initialDelaySeconds: 5
+            periodSeconds: 5
+            timeoutSeconds: 3
+          livenessProbe:
+            httpGet:
+              path: /_localstack/health
+              port: 4566
+            initialDelaySeconds: 30
+            periodSeconds: 30
+            timeoutSeconds: 5
+          volumeMounts:
+            - name: data
+              mountPath: /var/lib/localstack
+      volumes:
+        - name: data
+          emptyDir: {}
diff --git a/deploy/orca/dev/02-init-job.yaml.tmpl b/deploy/orca/dev/02-init-job.yaml.tmpl
new file mode 100644
index 00000000..41285369
--- /dev/null
+++ b/deploy/orca/dev/02-init-job.yaml.tmpl
@@ -0,0 +1,81 @@
+---
+# Init Job: creates the cachestore + origin S3 buckets in LocalStack so
+# that Orca can pass the versioningGate boot check and so that reviewers
+# have an origin bucket to seed sample objects into. Idempotent:
+# CreateBucket returns BucketAlreadyOwnedByYou on rerun, swallowed by
+# the script.
+#
+# Cachestore bucket: versioning left unset (the driver unconditionally
+# refuses to start against a versioned bucket since If-None-Match: *
+# is not honored on versioned buckets).
+# Origin bucket: no versioning constraint; sample objects live here.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: orca-buckets-init
+  namespace: {{ default "unbounded-kube" .Namespace }}
+  labels:
+    app.kubernetes.io/name: orca
+    app.kubernetes.io/part-of: orca-dev
+spec:
+  backoffLimit: 6
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: orca
+        app.kubernetes.io/part-of: orca-dev
+    spec:
+      restartPolicy: OnFailure
+      containers:
+        - name: aws-cli
+          image: {{ default "amazon/aws-cli:latest" .AwsCliImage | quote }}
+          env:
+            - name: AWS_ACCESS_KEY_ID
+              value: test
+            - name: AWS_SECRET_ACCESS_KEY
+              value: test
+            - name: AWS_DEFAULT_REGION
+              value: us-east-1
+            - name: CACHESTORE_BUCKET
+              value: {{ default "orca-cache" .CachestoreBucket | quote }}
+            - name: ORIGIN_BUCKET
+              value: {{ default "orca-origin" .OriginBucket | quote }}
+            - name: ENDPOINT
+              value: http://localstack.{{ default "unbounded-kube" .Namespace }}.svc.cluster.local:4566
+          command:
+            - /bin/sh
+            - -c
+            - |
+              set -e
+              echo "Waiting for LocalStack at $ENDPOINT ..."
+              for i in $(seq 1 60); do
+                if aws --endpoint-url "$ENDPOINT" s3api list-buckets >/dev/null 2>&1; then
+                  echo "LocalStack ready."
+                  break
+                fi
+                sleep 2
+              done
+
+              ensure_bucket() {
+                bucket="$1"
+                echo "Ensuring bucket $bucket (idempotent) ..."
+                if aws --endpoint-url "$ENDPOINT" s3api head-bucket --bucket "$bucket" >/dev/null 2>&1; then
+                  echo "Bucket $bucket already exists."
+                else
+                  aws --endpoint-url "$ENDPOINT" s3api create-bucket --bucket "$bucket"
+                  echo "Bucket $bucket created."
+                fi
+              }
+
+              ensure_bucket "$CACHESTORE_BUCKET"
+              ensure_bucket "$ORIGIN_BUCKET"
+
+              # Verify cachestore bucket versioning is unset (Orca's
+              # versioningGate rejects Enabled or Suspended).
+              status=$(aws --endpoint-url "$ENDPOINT" s3api get-bucket-versioning --bucket "$CACHESTORE_BUCKET" --query Status --output text 2>/dev/null || echo "None")
+              echo "Cachestore bucket versioning: $status (None means unset, which is required)."
+              if [ "$status" = "Enabled" ] || [ "$status" = "Suspended" ]; then
+                echo "ERROR: cachestore bucket versioning is $status; Orca requires unset/None."
+                exit 1
+              fi
+              echo "Init complete."
diff --git a/deploy/orca/dev/03-azurite.yaml.tmpl b/deploy/orca/dev/03-azurite.yaml.tmpl
new file mode 100644
index 00000000..e70209e8
--- /dev/null
+++ b/deploy/orca/dev/03-azurite.yaml.tmpl
@@ -0,0 +1,117 @@
+---
+# Azurite is Microsoft's official Azure Storage emulator. We use it as
+# an alternative origin in the dev harness so reviewers can exercise
+# the azureblob origin driver path without a real Azure account.
+#
+# Well-known dev account/key (documented at
+# https://learn.microsoft.com/azure/storage/common/storage-use-azurite):
+#   AccountName: devstoreaccount1
+#   AccountKey:  Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==
+#   BlobURL:     http://azurite.<ns>.svc.cluster.local:10000/devstoreaccount1
+apiVersion: v1
+kind: Service
+metadata:
+  name: azurite
+  namespace: {{ default "unbounded-kube" .Namespace }}
+  labels:
+    app.kubernetes.io/name: azurite
+    app.kubernetes.io/part-of: orca-dev
+spec:
+  # NodePort so the host-side seeder tool (hack/cmd/orcaseed) can
+  # reach Azurite without a kubectl port-forward. Kind binds node
+  # ports to the host's loopback, so the seeder talks to
+  # http://localhost:<nodePort>/devstoreaccount1/. The fixed port
+  # (default 30100) sits in the Kubernetes NodePort range
+  # (30000-32767). Two concurrent dev clusters on the same host
+  # would collide; override via AzuriteNodePort in the renderer
+  # invocation if you run more than one.
+  type: NodePort
+  selector:
+    app.kubernetes.io/name: azurite
+  ports:
+    - name: blob
+      port: 10000
+      targetPort: 10000
+      nodePort: {{ default "30100" .AzuriteNodePort }}
+      protocol: TCP
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: azurite
+  namespace: {{ default "unbounded-kube" .Namespace }}
+  labels:
+    app.kubernetes.io/name: azurite
+    app.kubernetes.io/part-of: orca-dev
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: azurite
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: azurite
+        app.kubernetes.io/part-of: orca-dev
+    spec:
+      containers:
+        - name: azurite
+          image: {{ default "mcr.microsoft.com/azure-storage/azurite:3.33.0" .AzuriteImage | quote }}
+          imagePullPolicy: IfNotPresent
+          # Bind to 0.0.0.0 so the Service can reach it; default is
+          # 127.0.0.1.
+          # --skipApiVersionCheck allows newer Azure SDK clients
+          # (which advertise API versions Azurite hasn't yet caught up
+          # with) to talk to it.
+          # --loose disables strict validation of newer SDK headers.
+          # --disableProductStyleUrl forces path-style URL parsing.
+          # Without it, Azurite parses the first DNS label of the Host
+          # header as the account name (so requests to azurite.<ns>...
+          # would be misinterpreted as account="azurite" rather than
+          # account="devstoreaccount1").
+          # --debug routes Azurite's internal request log to a file;
+          # tail it via `kubectl exec ... -- cat /tmp/azurite-debug.log`
+          # when triaging 4xx responses.
+          args:
+            - azurite-blob
+            - --blobHost
+            - 0.0.0.0
+            - --blobPort
+            - "10000"
+            - --skipApiVersionCheck
+            - --loose
+            - --disableProductStyleUrl
+            - --debug
+            - /tmp/azurite-debug.log
+            - --location
+            - /data
+          ports:
+            - containerPort: 10000
+              name: blob
+              protocol: TCP
+          resources:
+            requests:
+              cpu: 50m
+              memory: 128Mi
+            limits:
+              cpu: 500m
+              memory: 512Mi
+          readinessProbe:
+            tcpSocket:
+              port: 10000
+            initialDelaySeconds: 3
+            periodSeconds: 5
+            timeoutSeconds: 3
+          livenessProbe:
+            tcpSocket:
+              port: 10000
+            initialDelaySeconds: 30
+            periodSeconds: 30
+            timeoutSeconds: 5
+          volumeMounts:
+            - name: data
+              mountPath: /data
+      volumes:
+        - name: data
+          emptyDir: {}
diff --git a/deploy/orca/dev/04-azurite-init.yaml.tmpl b/deploy/orca/dev/04-azurite-init.yaml.tmpl
new file mode 100644
index 00000000..8ad9433f
--- /dev/null
+++ b/deploy/orca/dev/04-azurite-init.yaml.tmpl
@@ -0,0 +1,54 @@
+---
+# Init Job: creates the Azure container in Azurite so Orca's azureblob
+# origin driver has somewhere to read from. Idempotent: az container
+# create with --fail-on-exist false treats existence as success.
+#
+# Uses the well-known Azurite dev creds (devstoreaccount1 + the
+# documented public key); these are baked into Azurite and not
+# secrets.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: orca-azurite-container-init
+  namespace: {{ default "unbounded-kube" .Namespace }}
+  labels:
+    app.kubernetes.io/name: orca
+    app.kubernetes.io/part-of: orca-dev
+spec:
+  backoffLimit: 6
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: orca
+        app.kubernetes.io/part-of: orca-dev
+    spec:
+      restartPolicy: OnFailure
+      containers:
+        - name: az-cli
+          image: {{ default "mcr.microsoft.com/azure-cli:latest" .AzCliImage | quote }}
+          env:
+            - name: AZURE_STORAGE_CONNECTION_STRING
+              value: "DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite.{{ default "unbounded-kube" .Namespace }}.svc.cluster.local:10000/devstoreaccount1;"
+            - name: CONTAINER
+              value: {{ default "orca-test" .AzuriteContainer | quote }}
+          command:
+            - /bin/sh
+            - -c
+            - |
+              set -e
+              echo "Waiting for Azurite ..."
+              for i in $(seq 1 60); do
+                if az storage container list --output none 2>/dev/null; then
+                  echo "Azurite ready."
+                  break
+                fi
+                sleep 2
+              done
+              echo "Ensuring container ${CONTAINER} (idempotent) ..."
+              if az storage container exists --name "${CONTAINER}" --query exists --output tsv | grep -qi true; then
+                echo "Container ${CONTAINER} already exists."
+              else
+                az storage container create --name "${CONTAINER}" --output none
+                echo "Container ${CONTAINER} created."
+              fi
+              echo "Init complete."
\ No newline at end of file
diff --git a/deploy/orca/rendered/.gitignore b/deploy/orca/rendered/.gitignore
new file mode 100644
index 00000000..f79c394d
--- /dev/null
+++ b/deploy/orca/rendered/.gitignore
@@ -0,0 +1,3 @@
+# rendered manifests are gitignored; produced by `make orca-manifests`.
+*
+!.gitignore
diff --git a/designs/orca/brief.md b/designs/orca/brief.md
new file mode 100644
index 00000000..db51c35b
--- /dev/null
+++ b/designs/orca/brief.md
@@ -0,0 +1,206 @@
+# Orca - Origin Cache - Architecture Brief
+
+A one-screen orientation: what Orca is, the load-bearing
+decisions, and the risks. For mechanism and flow, see
+[design.md](./design.md).
+
+## 1. Problem and approach
+
+Cloud blob storage (AWS S3, Azure Blob) is slow and expensive
+when many on-prem clients read from it at once. Orca's target
+workload is large immutable artifacts - job inputs, model
+weights, training shards - read by thousands of clients with
+correlated cold starts. Direct cloud access at that scale is a
+cost and latency problem.
+
+Orca is a read-only S3-compatible HTTP cache that sits inside
+the on-prem datacenter as a multi-replica Kubernetes Deployment.
+It fronts AWS S3 and Azure Blob, serves chunked bytes keyed by
+ETag out of a shared in-DC store, and makes sure the same chunk
+is fetched only once no matter how many clients ask for it.
+Clients use the same `GetObject` / `HeadObject` / `ListObjectsV2`
+calls they already use.
+
+## 2. Goals and non-goals
+
+In scope:
+- Read-only S3-compatible API: `GetObject` with `Range`,
+  `HeadObject`, minimal `ListObjectsV2` pass-through.
+- Multi-PB working set; thousands of concurrent clients.
+- One Orca deployment per datacenter, no cross-DC peering.
+- Near-zero origin stampede under correlated cold-access bursts.
+- Fast TTFB on both hits and misses.
+- Atomic, durable commit of fetched chunks.
+- Bounded staleness: at most 5 minutes if an operator overwrites
+  a key in place (`metadata.ttl`), at most 60 seconds for the
+  "uploaded after a 404" case (`metadata.negative_ttl`).
+  Otherwise zero.
+
+Out of scope:
+- Writes, multipart uploads, object versioning.
+- Cross-DC peering.
+- SigV4 verification (bearer / mTLS hooks exist but nothing
+  enforces them yet).
+- Multi-tenant quotas; per-client / per-IP rate limiting.
+- Origin-pushed invalidation (the ETag covers it).
+- Encryption at rest beyond what the backing store provides.
+
+## 3. System at a glance
+
+A client request lands on one replica, the **assembler**. The
+assembler walks the requested byte range chunk by chunk. Hits
+read directly from the shared **CacheStore**. Misses go to the
+chunk's **coordinator** - the one replica a hash on chunk
+identity picks from the headless Service membership. That
+coordinator deduplicates concurrent fetches with a per-`ChunkKey`
+singleflight, calls the **Origin**, and commits to the
+CacheStore in a single no-overwrite write. The coordinator may
+be the same replica as the assembler (local fill) or a different
+one (called over the internal fill RPC).
+
+### Diagram A: System overview
+
+```mermaid
+graph TB
+    subgraph DC["On-prem datacenter"]
+        Clients["Edge clients"]
+        Service["Service (ClusterIP / LB)<br/>client traffic"]
+        subgraph Replicas["orca Deployment"]
+            R1["Replica 1<br/>:8443 edge<br/>:8444 internal<br/>:8442 ops"]
+            R2["Replica 2"]
+            R3["Replica N"]
+        end
+        Headless["Headless Service<br/>peer discovery"]
+        Internal["Internal listener :8444<br/>per-chunk fill RPC"]
+        Ops["Ops :8442<br/>/healthz, /readyz<br/>(kubelet only)"]
+        CS[("CacheStore<br/>in-DC S3-compatible")]
+    end
+    subgraph Cloud["Cloud origins"]
+        S3[("AWS S3")]
+        Azure[("Azure Blob<br/>Block Blobs only")]
+    end
+    Clients -- "S3 GET / HEAD / LIST<br/>+ Range" --> Service
+    Service --> R1
+    Service --> R2
+    Service --> R3
+    R1 -. "DNS refresh<br/>default 5s" .-> Headless
+    R2 -.-> Headless
+    R3 -.-> Headless
+    R1 <--> Internal
+    R2 <--> Internal
+    R3 <--> Internal
+    R1 -.- Ops
+    R2 -.- Ops
+    R3 -.- Ops
+    R1 <--> CS
+    R2 <--> CS
+    R3 <--> CS
+    R1 -- "miss-fill<br/>If-Match: etag" --> S3
+    R2 -- "miss-fill<br/>If-Match: etag" --> S3
+    R3 -- "miss-fill<br/>If-Match: etag" --> Azure
+```
+
+## 4. Five load-bearing mechanisms
+
+### 4.1 Chunking and identity
+
+Objects are split into fixed-size chunks (8 MiB by default,
+tunable). A chunk's name (`ChunkKey`) is
+`{origin_id, bucket, object_key, etag, chunk_size, chunk_index}`,
+and that name deterministically becomes the chunk's storage
+path. The ETag is the key's identity: a new ETag means a new
+path, so Orca cannot serve old bytes for a new ETag by
+construction. Empty-ETag origin responses are rejected at
+`Head`.
+
+The chunk size is not fixed. For bigger objects the edge picks a
+bigger chunk size (8 MiB up to 128 MiB by default, see
+`chunking.tiers`), so the per-object request count stays
+manageable. The edge also fetches the next few chunks in
+parallel while sending the current one to the client
+(`chunking.readahead`, default 8). Both knobs help large-blob
+throughput without changing how chunks are stored or addressed.
+
+### 4.2 Singleflight + commit-after-serve
+
+The coordinator's singleflight collapses many concurrent misses
+for the same chunk into a single origin fetch. The leader retries
+transient origin errors up to 3 times in 5 seconds before sending
+any client headers, releases joiners as soon as the chunk is in
+memory and length-checked, and commits to the cachestore in
+parallel. A commit failure is invisible to the client: the chunk
+just isn't recorded and the next request refills.
+
+### 4.3 Per-chunk coordinator (rendezvous hashing)
+
+Each replica polls the headless Service for peer IPs every 5
+seconds and uses a rendezvous hash on chunk identity to pick one
+coordinator per chunk. The assembler calls coordinators over the
+internal listener (`:8444`, plain HTTP in dev). One client
+request that spans N chunks can hit N different coordinators -
+that's how Orca spreads hot chunks. Stale routes during
+membership churn are caught by an `X-Orca-Internal: 1` header
+plus a self-check on the receiver; a mismatch returns 409 and
+the caller falls back to filling locally.
+
+### 4.4 Atomic-commit primitive
+
+The leader publishes a chunk to the CacheStore in one write that
+won't overwrite. `cachestore/s3` uses `PutObject +
+If-None-Match: *`; the loser of a race gets 412 and is recorded
+as `ErrCommitLost`. At boot the driver runs two checks - a
+self-test that proves the precondition is honored, and a
+versioning gate that refuses to start on versioned buckets
+(several S3-compatible backends ignore `If-None-Match: *` on
+them).
+
+### 4.5 Bounded staleness contract
+
+Operators promise: once a key is published, its bytes never
+change. To change the data, publish a new key. As long as the
+promise holds, Orca cannot serve stale bytes (the ETag is in
+the chunk's path). If the promise is broken, Orca may serve old
+bytes for up to 5 minutes (`metadata.ttl`). That's the
+load-bearing correctness statement and must appear in
+consumer-API docs. Every `Origin.GetRange` also carries
+`If-Match: <etag>` as a safety net. A matching bound applies to
+the "uploaded after a 404" case: 60 seconds
+(`metadata.negative_ttl`) per replica that saw the original 404.
+
+## 5. Backing-store options
+
+One driver ships today:
+
+- `cachestore/s3` - an in-DC S3-compatible object store (VAST in
+  production, LocalStack in dev). Atomic-commit primitive is
+  `PutObject + If-None-Match: *`; the boot self-test and the
+  versioning gate keep it honest.
+
+Shared-POSIX-filesystem drivers (`cachestore/posixfs`,
+`cachestore/localfs`) were designed and not built. See
+[design.md s13](./design.md#13-deferred--future-work).
+
+## 6. Top risks
+
+| Risk | What goes wrong | Bound | Detail |
+|---|---|---|---|
+| Immutable-origin promise | Operator overwrites a key instead of publishing a new one | Up to 5 min stale (`metadata.ttl`) | [s9](./design.md#9-bounded-staleness-contract) |
+| Empty-ETag origin | Two versions share a storage path; corrupt reads | Rejected at `Head`; 502 `OriginMissingETag` | [s2](./design.md#2-decisions) |
+| Commit-after-serve failure | Client got bytes; cachestore commit failed | Chunk unrecorded; next request refills. Debug logs only today | [s7.7](./design.md#77-failure-handling-without-re-stampede) |
+| Approximate origin cap | Scale changes mis-size the cluster-wide cap | Mirror replica count into `cluster.target_replicas` | [s13](./design.md#13-deferred--future-work) |
+| Create-after-404 staleness | Upload after a 404 reached a client | Up to 60s per replica (`metadata.negative_ttl`) | [s10](./design.md#10-create-after-404-and-negative-cache-lifecycle) |
+| Auth stubbed | Bearer / mTLS hooks not enforced | Rely on NetworkPolicy until built | [s13](./design.md#13-deferred--future-work) |
+
+## 7. Where to go next
+
+`design.md` for the full picture:
+
+- [s2 Decisions](./design.md#2-decisions)
+- [s3 Terminology](./design.md#3-terminology)
+- [s4 Architecture](./design.md#4-architecture)
+- [s7 Stampede protection](./design.md#7-stampede-protection)
+- [s8 Atomic commit](./design.md#8-atomic-commit)
+- [s9 Bounded staleness contract](./design.md#9-bounded-staleness-contract)
+- [s10 Create-after-404](./design.md#10-create-after-404-and-negative-cache-lifecycle)
+- [s11 Eviction and capacity](./design.md#11-eviction-and-capacity)
+- [s13 Deferred / future work](./design.md#13-deferred--future-work)
diff --git a/designs/orca/design.md b/designs/orca/design.md
new file mode 100644
index 00000000..4c597d4c
--- /dev/null
+++ b/designs/orca/design.md
@@ -0,0 +1,1290 @@
+# Orca - Origin Cache - Design
+
+What Orca does, how it does it, and the few decisions that keep it
+correct under load. The shorter stakeholder version is in
+[brief.md](./brief.md).
+
+## Table of contents
+
+1. [Overview](#1-overview)
+2. [Decisions](#2-decisions)
+3. [Terminology](#3-terminology)
+4. [Architecture](#4-architecture)
+5. [Chunk model](#5-chunk-model)
+6. [Request flow](#6-request-flow)
+7. [Stampede protection](#7-stampede-protection)
+8. [Atomic commit](#8-atomic-commit)
+9. [Bounded staleness contract](#9-bounded-staleness-contract)
+10. [Create-after-404 and negative-cache lifecycle](#10-create-after-404-and-negative-cache-lifecycle)
+11. [Eviction and capacity](#11-eviction-and-capacity)
+12. [Horizontal scale](#12-horizontal-scale)
+13. [Deferred / future work](#13-deferred--future-work)
+
+---
+
+## 1. Overview
+
+Clients inside an on-prem datacenter need to read large files
+that live in cloud blob storage (AWS S3, Azure Blob). Letting
+every client read from the cloud directly costs too much,
+adds too much latency, and pushes too much traffic across the
+security boundary.
+
+Orca sits inside the datacenter and reads from cloud storage on
+the clients' behalf. It speaks an S3-compatible HTTP API, so
+clients use the same SDKs they already use. On a cache hit it
+serves from a shared in-DC store. On a miss it fetches from the
+cloud, saves the result, and returns it.
+
+Orca splits each object into fixed-size chunks (8 MiB by
+default). Each chunk's storage path is a hash of the object's
+identity (origin, bucket, key, ETag, chunk size). Orca runs as a
+multi-replica Kubernetes Deployment. The replicas share one
+in-DC store. They find each other through a headless Service.
+For any given chunk a single hash picks one replica as the
+chunk's "coordinator" - the only replica that's allowed to
+fetch that chunk from the cloud. The other replicas ask the
+coordinator over a private channel. The result: even if a
+thousand clients ask for the same chunk at the same time, the
+cloud sees exactly one fetch.
+
+## 2. Decisions
+
+| Area | Decision |
+|---|---|
+| Client API | S3-compatible HTTP. `GET` + `HEAD` + a minimal `ListObjectsV2` pass-through. Range reads work. |
+| Auth surface | Bearer / mTLS hooks exist on the edge and the internal listener, but nothing checks them yet. Dev runs with auth off. See s4 and [Deferred / future work](#13-deferred--future-work). |
+| Origins | AWS S3 and Azure Blob, behind a pluggable `Origin` interface. |
+| Azure constraint | Block Blobs only. Page and Append blobs are rejected at `Head` with `UnsupportedBlobTypeError`. |
+| Cachestore | An in-DC S3-compatible store (`cachestore/s3`): LocalStack in dev, VAST or similar in production. Treated as the truth for what chunks exist. |
+| Atomic commit | `PutObject` with `If-None-Match: *`. The second concurrent commit gets a `412` and is recorded as `ErrCommitLost`. At boot, `SelfTestAtomicCommit` proves the backend honors the precondition; if it doesn't, the process refuses to start. |
+| Versioned cachestore buckets | Not supported. At boot, `GetBucketVersioning` runs; if the bucket has versioning enabled or suspended, the process refuses to start. VAST and several S3-compatible backends ignore `If-None-Match: *` on versioned buckets, which would silently break the atomic-commit rule. |
+| Chunking | Default 8 MiB (`chunking.size`). For bigger objects, an optional tier ladder (`chunking.tiers`) picks a larger size: 64 MiB for objects over 1 GiB, 128 MiB for objects over 10 GiB. The chunk size is part of the chunk's storage path, so changing the default or any tier never breaks existing data. Minimum 1 MiB. |
+| Read-ahead | While the edge sends one chunk to the client, it can fetch the next few chunks in parallel. The default is 8 in flight. Set `chunking.readahead: 0` to turn it off. |
+| Consistency | Operators promise: once a key is published, its bytes never change. To change the data, publish a new key. Orca treats the ETag as the key's identity, not as a freshness check. We also send `If-Match: <etag>` on every fetch as a safety net. If an operator breaks the promise, the wrong data is served for at most 5 minutes (`metadata.ttl`). If a key is uploaded after someone already saw a 404 on it, the wrong 404 is served for at most 60 seconds (`metadata.negative_ttl`). See [s9](#9-bounded-staleness-contract). |
+| ETag presence | The origin must return a non-empty ETag on `Head`. If it doesn't, Orca rejects the response with `origin.MissingETagError`. Without an ETag, two different versions of the same `(bucket, key)` would hash to the same storage path and Orca would silently serve old bytes. |
+| Catalog | An in-memory LRU (`ChunkCatalog`) that remembers which chunks are in the cachestore. Presence-only - no size or access count. Capped at 100,000 entries by default. |
+| Cluster | Kubernetes Deployment + headless Service for peer discovery + ClusterIP / LB for client traffic. A hash on the chunk's identity picks one replica as the chunk's coordinator. The replica that received the client request - the **assembler** - asks the right coordinator for each chunk in the range. On hits, any replica can read the cachestore directly. |
+| Internal-listener auth | Config keys exist for mTLS, but nothing enforces them yet. Dev runs with mTLS off. |
+| Origin concurrency cap | Each replica caps in-flight origin fetches at `floor(origin.target_global / cluster.target_replicas)` - 64 by default. When the origin throttles (503, 429, retryable 5xx), the leader retries with exponential backoff before sending any HTTP headers, so the client never sees the throttle. |
+| Tenancy | One tenant, one set of origin credentials. |
+| Listeners | Three: edge `:8443`, internal-fill `:8444`, ops `:8442` (`/healthz`, `/readyz`). All plain HTTP in dev. |
+| Repo home | This repo. Code under `internal/orca/`, manifests under `deploy/orca/`, dev harness under `hack/orca/`. |
+
+## 3. Terminology
+
+- **Replica** - one running pod of the `orca` Deployment. Replicas
+  are interchangeable; they hold only in-memory caches.
+- **Client** - whoever is calling the S3-compatible HTTP API.
+- **Origin** - the upstream cloud store (AWS S3 or Azure Blob).
+  Orca only reads from it. Interface in
+  `internal/orca/origin/origin.go`.
+- **CacheStore** - the shared in-DC chunk store. The truth for
+  what's cached. Today this is `cachestore/s3` (an in-DC
+  S3-compatible object store). Interface in
+  `internal/orca/cachestore/cachestore.go`; commit rules in
+  [s8](#8-atomic-commit).
+- **Chunk** - one piece of an object. The size is chosen per
+  request from a small ladder: 8 MiB for small objects, up to 128
+  MiB for objects over 10 GiB by default. Orca caches and fills
+  chunks, not whole objects.
+- **ChunkKey** - the chunk's name:
+  `{origin_id, bucket, object_key, etag, chunk_size, chunk_index}`.
+  See [s5](#5-chunk-model).
+- **Headless Service** - a Kubernetes Service with `clusterIP: None`.
+  Its DNS A-record returns the IPs of all Ready pods. Orca polls
+  it every 5s (default) to learn the current peers.
+- **Rendezvous hashing** (HRW) - for a key, score every peer with
+  `hash(peer_ip || key)` and pick the highest score. Stable when
+  peers come and go: a chunk's owner only changes if its own
+  owner is added or removed. Orca uses this to pick one
+  coordinator per chunk.
+- **Coordinator** - the replica the hash picks to fetch a chunk
+  on a miss. One coordinator per chunk, not per request and not
+  per object.
+- **Assembler** - the replica that took the client request. It
+  walks the requested byte range chunk by chunk. For each chunk
+  it reads from the cachestore on a hit, or asks the chunk's
+  coordinator on a miss (locally or over the internal RPC).
+- **Singleflight** - a small in-process trick: if a fetch for a
+  given chunk is already running, new requests for that chunk
+  wait for the running fetch instead of starting their own. The
+  first arrival is the **leader**; the rest are **joiners**. See
+  [s7.1](#71-per-chunkkey-singleflight).
+- **Per-chunk internal fill RPC** -
+  `GET /internal/fill?<chunk-key params>` over plain HTTP on the
+  internal listener (`:8444` by default). The assembler calls it
+  when the coordinator is some other replica.
+- **Atomic CacheStore commit** - the write that publishes a chunk
+  to the cachestore without overwriting anything. `PutObject` with
+  `If-None-Match: *`. If two replicas race, one wins with `200`
+  and the other gets `412` (recorded as `ErrCommitLost`).
+- **Immutable-origin contract** - operators promise that once
+  they publish a key, its bytes never change. If they break this,
+  Orca may serve the old bytes for up to `metadata.ttl`. See
+  [s9](#9-bounded-staleness-contract).
+- **Pre-header retry** - the leader retries a failed
+  `Origin.GetRange` up to 3 times within 5 seconds before sending
+  any HTTP header to the client. Transient origin failures stay
+  invisible. `OriginETagChangedError` is not retried.
+- **Negative-cache entry** - a metadata-cache entry that
+  remembers a `404`, an `UnsupportedBlobTypeError`, or a
+  `MissingETagError`. Reused for 60 seconds by default
+  (`metadata.negative_ttl`).
+- **S3 versioning gate** - a boot-time `GetBucketVersioning`
+  check. If the cachestore bucket has versioning enabled or
+  suspended, Orca refuses to start.
+- **MissingETagError** - what the fetch coordinator returns when
+  the origin's `Head` response has no ETag. Comes back to the
+  client as a 502 `OriginMissingETag` and is cached negatively.
+
+## 4. Architecture
+
+Orca is a single binary deployed as a Kubernetes Deployment.
+Replicas discover each other through a headless Service and
+refresh the peer list every 5 seconds by default
+(`cluster.membership_refresh`).
+
+A client request lands on one replica, the **assembler**. The
+assembler walks the requested byte range chunk by chunk. For
+each chunk:
+
+- If the chunk is in the cachestore, the assembler reads it
+  directly. Any replica can do this.
+- If not, a hash on the chunk's identity picks the **coordinator**
+  for that chunk. If the coordinator is this replica, the
+  assembler fetches the chunk locally. If it's some other
+  replica, the assembler asks that replica over the internal-fill
+  RPC.
+
+One tenant. One set of origin credentials per deployment.
+
+Each replica runs three HTTP listeners:
+
+- **Edge (`:8443`)** - the S3-compatible client API. Auth is
+  wired in config but not enforced. Dev runs with
+  `server.auth.enabled: false`.
+- **Internal-fill (`:8444`)** - serves `GET /internal/fill`, the
+  RPC between replicas. Plain HTTP in dev
+  (`cluster.internal_tls.enabled: false`).
+- **Ops (`:8442`)** - serves `/healthz` (always 200 while the
+  process is up) and `/readyz` (200 once the cachestore
+  self-test has passed and the cluster has at least one peer-set
+  snapshot). Plain HTTP, no auth. Production manifests point the
+  kubelet probes here; the client Service does not expose this
+  port.
+
+### Diagram 1: System overview
+
+```mermaid
+graph TB
+    subgraph DC["On-prem datacenter"]
+        Clients["Edge clients"]
+        Service["Service (ClusterIP / LB)<br/>client traffic"]
+        subgraph Replicas["orca Deployment"]
+            R1["Replica 1<br/>:8443 edge<br/>:8444 internal<br/>:8442 ops"]
+            R2["Replica 2"]
+            R3["Replica N"]
+        end
+        Headless["Headless Service<br/>peer discovery"]
+        Internal["Internal listener :8444<br/>GET /internal/fill"]
+        Ops["Ops :8442<br/>/healthz, /readyz<br/>(kubelet only)"]
+        CS[("CacheStore<br/>in-DC S3-compatible")]
+    end
+    subgraph Cloud["Cloud origins"]
+        S3[("AWS S3")]
+        Azure[("Azure Blob<br/>Block Blobs only")]
+    end
+    Clients -- "S3 GET / HEAD / LIST<br/>+ Range" --> Service
+    Service --> R1
+    Service --> R2
+    Service --> R3
+    R1 -. "DNS refresh<br/>default 5s" .-> Headless
+    R2 -.-> Headless
+    R3 -.-> Headless
+    R1 <--> Internal
+    R2 <--> Internal
+    R3 <--> Internal
+    R1 -.- Ops
+    R2 -.- Ops
+    R3 -.- Ops
+    R1 <--> CS
+    R2 <--> CS
+    R3 <--> CS
+    R1 -- "miss-fill<br/>If-Match: etag" --> S3
+    R2 -- "miss-fill<br/>If-Match: etag" --> S3
+    R3 -- "miss-fill<br/>If-Match: etag" --> Azure
+```
+
+## 5. Chunk model
+
+A `ChunkKey` is six fields: `{origin_id, bucket, object_key,
+etag, chunk_size, chunk_index}`.
+
+- `origin_id` is a deployment-scoped name from config (e.g.
+  `aws-us-east-1-prod`). Required. Two Orca deployments can share
+  the same cachestore bucket without colliding because their keys
+  start with different `origin_id` values.
+- `etag` makes a key's content explicit. A new ETag means a new
+  logical object: it gets a fresh set of chunks. Old chunks from
+  the old ETag fall out of the cachestore via lifecycle policy
+  (see [s11](#11-eviction-and-capacity)).
+- `chunk_size` is baked into the storage-path hash, so changing
+  it in config never corrupts existing data.
+- `chunk_index = floor(byte / chunk_size)`.
+
+A small metadata cache holds `(origin_id, bucket, key) -> ObjectInfo`
+with two TTLs: 5 minutes for hits, 60 seconds for misses. Without
+it, every request would re-`HEAD` the origin.
+
+Each chunk's storage path is deterministic:
+
+`LE64(x)` is the little-endian 8-byte encoding of a 64-bit unsigned
+integer, `||` is byte-string concatenation, and `LP(s)` is the
+length-prefixed encoding of `s` (its length as `LE64` followed by
+its bytes). Length-prefixing each field prevents two distinct
+inputs from producing the same hash via boundary ambiguity (e.g.
+`("ab", "c")` vs. `("a", "bc")`).
+
+```
+LP(s)   = LE64(uint64(len(s))) || s
+hashKey = sha256(
+            LP(origin_id) ||
+            LP(bucket)    ||
+            LP(key)       ||
+            LP(etag)      ||
+            LE64(chunk_size)
+          )
+path    = "<origin_id>/<hex(hashKey)>/<chunk_index>"
+```
+
+`origin_id` is in the path in the clear (it's not hashed) so an
+operator can delete one deployment's chunks with a single
+`aws s3 rm --recursive <bucket>/<origin_id>/`. `chunk_size` goes
+into the hash, not the path, so changing it doesn't break
+anything visible.
+
+**What happens if you change `chunk_size`.** Nothing bad. Each
+chunk's path is hashed from the chunk size, so old chunks at the
+old size never collide with new chunks at the new size. The old
+chunks just become unreachable. Plan for two things while the
+working set rebuilds at the new size: storage usage roughly
+doubles, and origin traffic spikes briefly. The old chunks age
+out on their own via the bucket's lifecycle policy.
+
+### 5.1 Effective chunk size
+
+Chunk size is not one global number. The edge handler picks it
+per request from a base size plus an optional list of tiers.
+Each tier says "for objects this big and larger, use this chunk
+size." The base covers small objects; tiers kick in at higher
+object sizes.
+
+Default ladder:
+
+| Object size | Chunk size |
+|---|---|
+| under 1 GiB | 8 MiB (base) |
+| 1 GiB to 10 GiB | 64 MiB |
+| over 10 GiB | 128 MiB |
+
+**Why a ladder.** Small objects don't need big chunks - that
+would waste memory per fill. Big objects pay a high price for
+small chunks - more HTTP requests, more per-chunk overhead. The
+ladder picks a size that fits each object.
+
+**Why it's safe to change.** Each chunk's storage path includes
+the chunk size in its hash. So a chunk written at 8 MiB and a
+chunk written at 128 MiB live at different paths and never
+overlap. If you change the ladder, old chunks at the old size
+simply age out via the bucket lifecycle policy. Nothing gets
+corrupted.
+
+**Why tiers can't overlap.** The config requires tiers to be
+sorted by their object-size threshold, with no duplicates. The
+loader rejects anything else. So for any object size there is
+exactly one matching tier (or the base, if no tier matches).
+
+**Cross-replica safety.** The peer-to-peer fill RPC sends the
+chunk size along with every request (see
+[s7.3](#73-cluster-wide-deduplication-via-per-chunk-fill-rpc)).
+If two replicas are running with different tier settings during
+a rolling deploy, every request is still self-contained - the
+receiver uses the size the sender asked for. No coordination is
+needed.
+
+To find a chunk, Orca calls `CacheStore.Stat(key)`. The
+`ChunkCatalog` (an in-memory LRU) remembers recent Stat hits so
+the hot path skips the cachestore. The catalog is a cache for
+the cache: drop it and Orca still works. It stores nothing per
+entry beyond "this path is present", because the path already
+encodes the chunk's exact identity. If the cachestore later
+loses the chunk (e.g. lifecycle deletes it), the next `GetChunk`
+returns `ErrNotFound`, the caller calls `Forget`, and the next
+request re-stats.
+
+For a request `Range: bytes=A-B`:
+
+```
+firstChunk = A / chunk_size
+lastChunk  = B / chunk_size
+for cid := firstChunk; cid <= lastChunk; cid++ {
+    fetchOrServe(cid)
+    sliceWithin(cid, max(A, cid*sz), min(B, (cid+1)*sz - 1))
+}
+```
+
+The loop is streaming: Orca never builds the full list of chunk
+keys up front.
+
+### Diagram 2: Range request -> chunk index mapping
+
+`SizeFor` below is the tier-ladder lookup described in
+[s5.1](#51-effective-chunk-size).
+
+```mermaid
+flowchart LR
+    Req["GET /bucket/key<br/>Range: bytes=A-B"] --> Math["chunk_size = SizeFor(info.Size)<br/>firstChunk = A / chunk_size<br/>lastChunk  = B / chunk_size"]
+    Math --> Iter["streaming iterator<br/>cid := firstChunk..lastChunk"]
+    Iter --> Keys["per cid: ChunkKey =<br/>{origin_id, bucket, key,<br/>etag, chunk_size, cid}"]
+    Keys --> Path["path =<br/>origin_id /<br/>hex(sha256(LP(origin_id) || ...)) /<br/>cid"]
+    Path --> CS[("CacheStore<br/>address")]
+```
+
+## 6. Request flow
+
+A `GET /{bucket}/{key}` arrives, maybe with a `Range` header.
+The edge handler does this:
+
+1. **Get the object's metadata.** Call
+   `fetch.Coordinator.HeadObject`. It first checks the metadata
+   cache. On a miss, the per-replica HEAD singleflight runs
+   `metadata.LookupOrFetch` and calls `Origin.Head` once. An
+   empty `ETag` in the response is rejected as
+   `MissingETagError`. Hits live 5 minutes (`metadata.ttl`);
+   negative cases (`ErrNotFound`, `UnsupportedBlobTypeError`,
+   `MissingETagError`) live 60 seconds (`metadata.negative_ttl`).
+2. **Handle empty objects.** If the object is zero bytes, return
+   200 with an empty body right away. A `Range` header on a
+   zero-byte object is 416.
+3. **Parse and check the range.** Validate any `Range` header
+   against `info.Size`. An unsatisfiable range is 416.
+4. Compute the chunk range with `chunk.IndexRange`.
+5. **Fetch the first chunk before sending any headers.** Call
+   `fc.GetChunk(firstKey, info.Size)`, wrap the reader in a
+   `bufio.Reader`, and `Peek(1)`. If the peek fails - origin
+   unreachable, auth, ETag changed, missing ETag - the handler
+   returns a clean S3-style error without ever sending a 200 /
+   206. Once that first byte is in hand, the handler sends
+   headers (`Content-Length`, optional `Content-Range`, `ETag`,
+   `Content-Type`) and starts streaming.
+6. **Stream chunk by chunk.** Stream the first chunk's slice,
+   then fetch and stream chunks 1..N. If a fetch fails after
+   headers are out, the response just ends mid-body; S3 SDKs
+   notice the Content-Length mismatch and retry.
+7. **For each chunk**, `fc.GetChunk` first checks the catalog and
+   the cachestore. A hit returns a reader clamped to
+   `k.ExpectedLen(info.Size)`. A miss goes to the cluster-wide
+   dedup path
+   ([s7.3](#73-cluster-wide-deduplication-via-per-chunk-fill-rpc)).
+8. **Cold-path fill.** The leader fetches the chunk from the
+   origin with pre-header retry, checks the body length against
+   `ExpectedLen`, buffers it in memory, releases the joiners, and
+   commits to the cachestore in the background (commit-after-
+   serve - see [s7.2](#72-singleflight--commit-after-serve)).
+
+### Diagram 3: Scenario A - warm read (cache hit)
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant C as Client
+    participant R as Replica (assembler)
+    participant Cat as ChunkCatalog
+    participant CS as CacheStore
+    C->>R: GET /bucket/key Range: bytes=A-B
+    R->>R: HeadObject -> info (metadata cache)
+    R->>Cat: Lookup(firstChunk)
+    Cat-->>R: hit
+    R->>CS: GetChunk(firstChunk, 0, expectedLen)
+    CS-->>R: bytes (reader)
+    R->>R: Peek(1)  // origin reachability proxy
+    R-->>C: 200/206 + headers + first slice
+    loop remaining chunks
+        R->>Cat: Lookup(k)
+        Cat-->>R: hit
+        R->>CS: GetChunk(k)
+        CS-->>R: bytes
+        R-->>C: stream slice
+    end
+```
+
+A cache hit. The assembler asks the catalog, reads from the
+cachestore, and streams to the client. No origin call, no peer
+call.
+
+### Diagram 4: Scenario B - cold miss, local coordinator
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant C as Client
+    participant R as Replica (assembler == coordinator)
+    participant SF as Singleflight on R
+    participant O as Origin
+    participant CS as CacheStore
+    participant Cat as ChunkCatalog
+    C->>R: GET /bucket/key Range
+    R->>R: HeadObject -> info
+    R->>R: ChunkCatalog miss, then Stat miss
+    R->>SF: Acquire(k) [leader]
+    SF->>O: GetRange(..., If-Match: etag)<br/>(pre-header retry)
+    O-->>SF: full chunk bytes
+    SF->>SF: validate buf.Len() == ExpectedLen(info.Size)
+    Note over SF: release joiners (close f.done)
+    SF-->>R: bytes (in-memory reader over f.bodyBuf)
+    R->>R: Peek(1), commit headers
+    R-->>C: 200/206 + headers + body
+    par commit-after-serve (async vs joiner reads)
+        SF->>CS: PutChunk(If-None-Match: *)
+        CS-->>SF: 200 (commit_won) or 412 (commit_lost)
+    end
+    alt commit_won
+        SF->>Cat: Record(k)
+    else commit_lost
+        SF->>CS: Stat(k), Record on success
+    end
+```
+
+A cold miss where the same replica is both the assembler and the
+coordinator. The replica fetches from origin, hands the bytes to
+the client, and writes to the cachestore in the background.
+
+### 6.1 HEAD request flow
+
+`HEAD /{bucket}/{key}` is served from object metadata. No chunks
+are touched.
+
+1. The edge handler calls `fc.HeadObject`. A metadata-cache hit
+   returns the cached `ObjectInfo`. A miss runs the per-replica
+   HEAD singleflight, which issues one `Origin.Head`.
+2. On success, return 200 with `Content-Length: info.Size`,
+   `ETag: info.ETag`, `Content-Type: info.ContentType`, and
+   `Accept-Ranges: bytes`.
+3. Errors reuse the GET error mapping (s6.3). A 404 is cached
+   negatively. `UnsupportedBlobTypeError` comes back as a 502
+   `OriginUnsupported`. `MissingETagError` comes back as a 502
+   `OriginMissingETag`. All three are cached negatively.
+
+### 6.2 LIST request flow
+
+`GET /{bucket}/?list-type=2&prefix=...` is a thin pass-through to
+`Origin.List`. The handler pulls `prefix`, `continuation-token`,
+and `max-keys` from the query string, calls the origin, and
+turns the result into a minimal `ListBucketResult` XML body.
+
+This is deliberately narrow. A per-replica LIST cache tuned for
+FUSE `ls` workloads is in scope as future work; see
+[Deferred / future work](#13-deferred--future-work).
+
+### 6.3 HTTP error-code mapping
+
+| Status | S3-style code | Reason | Triggered by | Client retry? |
+|---|---|---|---|---|
+| 200 / 206 | (none) | normal hit or successful fill | hit + range OK; cold-path fill after pre-header-retry commit | n/a |
+| 404 | `NoSuchKey` | origin returned `ErrNotFound` (cached negatively) | edge HEAD / GET miss | no |
+| 416 | (text body) | range vs. `info.Size` violation | range math at request entry; or any `Range` against a zero-byte object | no (different range) |
+| 502 | `OriginUnsupported` | non-BlockBlob azureblob; from `UnsupportedBlobTypeError` (cached negatively) | `Origin.Head` returns an unsupported blob type | no |
+| 502 | `OriginETagChanged` | `OriginETagChangedError` from `Origin.GetRange`; not retried | mid-flight overwrite caught by `If-Match` | yes (next request re-`Head`s) |
+| 502 | `OriginMissingETag` | `MissingETagError` from the fetch coordinator (cached negatively) | origin `Head` returned an empty ETag | no (operator must fix the origin config) |
+| 502 | `Unauthorized origin` | `origin.ErrAuth` | origin returned 401 / 403 | no (operator) |
+| 502 | `OriginUnreachable` | uncategorised origin error (5xx, timeouts past retry budget, DNS) | leader retry budget exhausted; cachestore failure during read | yes (origin may recover) |
+| 503 | (probe response) | replica `NotReady` | `/readyz` failing predicates | n/a (LB drain) |
+| (mid-stream abort) | n/a | post-header failure | origin disconnect, peer 5xx, cachestore failure after `Peek(1)` succeeded | S3 SDKs detect the Content-Length mismatch and retry |
+
+Pre-header errors come back as `http.Error` text. The 416 paths
+do too. There is no per-error S3-style XML envelope yet; S3 SDKs
+accept the text body and route on the HTTP status. Mid-stream
+aborts end the response (HTTP/2 `RST_STREAM` or HTTP/1.1
+`Connection: close`).
+
+### 6.4 Edge read-ahead
+
+The chunk-by-chunk loop in step 6 of the request flow is not
+strictly one-at-a-time. While the edge is sending one chunk to
+the client, it can pull the next few chunks from the cachestore
+at the same time. The default is up to 8 in flight per client
+request.
+
+**Why this matters.** A 700 GiB object at 128 MiB chunks is
+around 5,600 chunks. Without read-ahead, each chunk is fetched,
+then sent, then the next is fetched - one round trip after
+another. With 8 in flight, most of the per-chunk round-trip time
+is hidden behind sending bytes to the client.
+
+**How it works.** The edge starts a small producer that issues
+chunk fetches in order. Each fetch runs in its own worker.
+Results come back in chunk order via a small in-memory queue, so
+the client always receives bytes in the right order even if a
+later worker finishes first.
+
+**What stays the same.** The first chunk is still fetched and
+checked before any response headers go out. If something fails
+on chunk 0 - origin down, missing ETag, anything else - the
+client gets a clean S3-style error, not a partial body.
+Read-ahead only applies to chunks 1..N. Cold fills still go
+through the per-replica origin cap
+([s7.1](#71-per-chunkkey-singleflight)), so the cluster does not
+suddenly issue more origin requests just because read-ahead is
+on. Memory stays bounded by the origin cap.
+
+**What happens on failure.** If a chunk fetch fails after
+headers are out, the response just ends - same as before. If
+the client disconnects, the producer stops and closes any chunk
+bodies it has already pulled, so nothing leaks. If a worker
+panics, it is caught, logged, and reported back to the consumer
+as a fetch error.
+
+**Turning it off.** Set `chunking.readahead: 0` to go back to
+strict one-at-a-time fetching.
+
+## 7. Stampede protection
+
+The hot path. The job here is simple: when many clients ask for
+the same chunk at the same time, the origin should see one
+fetch, not many. Two mechanisms do this together.
+
+1. **Inside one replica:** if a fetch for a chunk is already
+   running, new requests for that chunk wait for the running
+   fetch instead of starting their own. This is the singleflight.
+2. **Across replicas:** a hash on the chunk's identity picks
+   exactly one replica as the coordinator for that chunk. The
+   other replicas ask that one over a private channel. So even
+   across the cluster, only one replica fetches.
+
+The named seams these mechanisms run through:
+
+| Seam | File | Role |
+|---|---|---|
+| `origin.Origin` | `internal/orca/origin/origin.go` (interface); `internal/orca/origin/awss3/`, `internal/orca/origin/azureblob/` | Read-only adapter to the upstream blob store. `If-Match: <etag>` on every `GetRange`. |
+| `cachestore.CacheStore` | `internal/orca/cachestore/cachestore.go` (interface); `internal/orca/cachestore/s3/` | In-DC chunk store; source of truth for chunk presence. `PutChunk` is atomic + no-clobber (returns `ErrCommitLost` on conflict). |
+| `chunkcatalog.Catalog` | `internal/orca/chunkcatalog/chunkcatalog.go` | Bounded in-memory LRU recording chunks known to be in the cachestore. Presence-only. |
+| `cluster.Cluster` | `internal/orca/cluster/cluster.go` | Peer discovery (DNS), rendezvous hashing, internal-fill RPC client + response validator. |
+| `fetch.Coordinator` | `internal/orca/fetch/fetch.go` | Per-replica fill orchestrator. Owns the singleflight, the origin semaphore, and the pre-header retry loop. |
+
+### 7.1 Per-`ChunkKey` singleflight
+
+The fetch coordinator keeps a map of in-flight fills, keyed on
+the chunk's storage path. The map is guarded by a mutex. Each
+entry holds a `done` channel, an error slot, and the buffer the
+leader will fill.
+
+Two cases on entry:
+
+- The map has no entry for this chunk. The caller becomes the
+  leader, inserts a fresh entry, and runs `runFill` in a
+  goroutine.
+- The map already has an entry. The caller is a joiner. It waits
+  on the leader's `done` channel.
+
+Joiners select between their own request context and `<-f.done`.
+On release they either return the leader's error or wrap the
+leader's buffer in a `bytes.Reader` and stream it. The leader
+guarantees the buffer is fully written and length-checked before
+it closes `done`, so joiners never see a half-written buffer.
+
+When `runFill` returns, the leader removes the in-flight entry.
+Any request arriving after that point misses the map. By then
+the chunk should be in the catalog and the request takes the
+hit path.
+
+### 7.2 Singleflight + commit-after-serve
+
+What the leader does in `runFill`:
+
+1. Runs on its own 5-minute context, not the client's. The
+   cachestore commit then finishes even if every caller has
+   walked away. The 5-minute ceiling caps how long a zombie fill
+   can hold resources.
+2. Takes a slot from the per-replica origin semaphore. The
+   semaphore is sized `floor(target_global / target_replicas)`.
+   Waiting more than `origin.queue_timeout` (default 5s) returns
+   an error to the caller.
+3. Calls `Origin.GetRange` through `fetchWithRetry`. The retry
+   loop is 3 attempts within 5 seconds, with exponential backoff
+   capped at 2 seconds. `OriginETagChangedError` and
+   `origin.ErrNotFound` are not retried.
+4. Copies the body into a fresh `bytes.Buffer`.
+5. **Checks the length** against `k.ExpectedLen(objectSize)`. A
+   short body is a hard error. If Orca recorded a short chunk,
+   later requests would silently get truncated data. So the
+   leader refuses to commit, hands the error to the joiners, and
+   lets the next request try again.
+6. Stores the buffer on the fill entry and **releases joiners**
+   (closes `f.done`, wrapped in a `sync.Once` so it fires
+   exactly once) **before** writing to the cachestore.
+7. Writes to the cachestore via `PutObject` with
+   `If-None-Match: *`.
+8. On success, records the chunk in the catalog.
+9. On `ErrCommitLost` (the 412 from the cachestore), another
+   replica won the race. Stat the existing entry and record it
+   in the catalog on success.
+10. On any other error, log it and move on. The chunk is not
+    recorded; the next request refills (one extra origin GET in
+    the worst case). The client never sees this error because the
+    response already went out.
+
+Releasing joiners before the commit matters for cold-path
+time-to-first-byte. Joiners get their bytes as soon as the
+origin delivered them. Without the reorder, joiners would wait
+for both the origin round-trip and the cachestore commit
+round-trip before seeing any data.
+
+The buffer-write, validate, release-joiners, then commit
+sequence is safe because `bytes.Buffer`'s underlying slice
+doesn't change after the final `io.Copy`. So joiners' reads of
+`buf.Bytes()` and the cachestore `PutChunk`'s read of the same
+slice are independent reads of an unchanging region.
+
+There is no on-disk spool and no tee. The full chunk lives in
+memory until the commit returns. Peak memory per fill is one
+chunk (8 MiB by default). With the per-replica origin cap at 64,
+the worst-case buffer footprint per replica is around 512 MiB
+under full saturation.
+
+### 7.3 Cluster-wide deduplication via per-chunk fill RPC
+
+A hash on the chunk's identity picks one coordinator from the
+current peer set. The replica that took the client request is
+the assembler. For each chunk in the requested range:
+
+- **Hit** (the catalog or `Stat` says the chunk is there): the
+  assembler reads from the cachestore directly. No internal RPC.
+- **Miss, this replica is the coordinator:** run the local
+  singleflight ([s7.1](#71-per-chunkkey-singleflight)) and commit
+  ([s7.2](#72-singleflight--commit-after-serve)).
+- **Miss, some other replica is the coordinator:** the assembler
+  calls `GET /internal/fill?<chunk-key params>` on that replica's
+  internal listener ([s7.4](#74-internal-rpc-listener)). The
+  coordinator runs the singleflight + commit path locally and
+  streams the bytes back. The assembler stitches the bytes into
+  the client response, slicing the first and last chunks to
+  match the client's `Range`.
+
+**Loop prevention.** The assembler sets `X-Orca-Internal: 1` on
+internal RPCs. The internal handler checks
+`Cluster.IsCoordinator(k)`. If the receiving replica disagrees
+(peer membership has shifted), it returns 409 with
+`{"reason":"not_coordinator"}`. `FillFromPeer` recognizes this
+as `cluster.ErrPeerNotCoordinator` and the caller falls back to
+filling locally. The loser of the resulting commit race gets
+`ErrCommitLost`. Internal RPCs are never forwarded.
+
+**Wire format.**
+`GET /internal/fill?origin_id=...&bucket=...&key=...&etag=...&chunk_size=N&index=N&object_size=N`.
+`DecodeChunkKey` requires `chunk_size > 0`, `index >= 0`,
+`object_size > 0`, and a non-empty `origin_id` and `key`.
+Anything else is a 400.
+
+**Response framing.** The coordinator sets `Content-Length` to
+`ExpectedLen(objectSize)` and `Content-Type` to
+`application/octet-stream`. The caller wraps the response body
+in a `validatingReader` that checks the actual byte count
+against the advertised length. If they disagree it returns
+`io.ErrUnexpectedEOF`. This catches truncated cross-replica
+responses.
+
+### Diagram 5: Scenario D - cold miss, remote coordinator
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant C as Client
+    participant A as Replica A (assembler)
+    participant B as Replica B (coordinator for k)
+    participant SF as Singleflight on B
+    participant O as Origin
+    participant CS as CacheStore
+    C->>A: GET /bucket/key Range
+    A->>A: rendezvous(k, peers) -> B
+    A->>B: GET /internal/fill?...&object_size=N<br/>X-Orca-Internal: 1
+    B->>B: IsCoordinator(k)?  yes
+    B->>SF: Acquire(k) [leader]
+    SF->>O: GetRange(..., If-Match: etag)<br/>(pre-header retry)
+    O-->>SF: full bytes
+    SF->>SF: validate buf.Len() == ExpectedLen
+    SF-->>B: bytes (in-memory)
+    B-->>A: 200 + Content-Length + stream<br/>(validatingReader on A's side)
+    A-->>C: stream sliced bytes
+    par async commit-after-serve on B
+        SF->>CS: PutChunk(If-None-Match: *)
+        CS-->>SF: commit_won or commit_lost
+    end
+    Note over A,B: 409 from B -> A falls back to local fill
+```
+
+A cold miss where the coordinator is a different replica. The
+assembler hands the work off, streams the bytes through, and the
+coordinator commits in the background. A 409 from the
+coordinator means peer membership has shifted; the assembler
+falls back to filling locally.
+
+### 7.4 Internal RPC listener
+
+The per-chunk fill RPC runs on its own port (default `:8444`,
+config `cluster.internal_listen`). That keeps cross-replica
+traffic off the client edge.
+
+In dev the listener is plain HTTP/2. Config keys exist for mTLS
+(`cluster.internal_tls.{enabled, cert_file, key_file, ca_file, server_name}`)
+but nothing enforces them yet. Production deployments rely on
+Kubernetes NetworkPolicy or equivalent to isolate the port, not
+on TLS at the listener.
+
+Loop prevention: the listener requires `X-Orca-Internal: 1` and
+checks `Cluster.IsCoordinator(k)`. Disagreement returns 409.
+
+The listener serves only `GET /internal/fill`. Health and
+readiness probes are on the ops listener; the client S3 API is
+on the edge listener.
+
+### 7.5 Metadata-layer singleflight
+
+Same pattern, at the metadata cache.
+`metadata.LookupOrFetch` maps each `(origin_id, bucket, key)`
+to a singleflight entry. So a flood of distinct cold keys
+generates at most one `Origin.Head` per object per replica per
+`metadata.ttl` window. Across the cluster that's up to N HEADs
+per object per window, where N is the peer count. A
+cluster-wide HEAD coordinator is future work.
+
+The entry is removed from the map **before** its `done` channel
+is closed, so a caller arriving in that brief window starts a
+fresh fetch instead of getting the old entry's cached error.
+The trade-off: under contention you might pay one extra HEAD
+per miss. In exchange a transient HEAD error never gets
+replayed to a later caller.
+
+### 7.6 Cancellation safety
+
+`runFill` runs on its own 5-minute context, so it finishes
+even when every caller has disconnected. The origin slot is
+released when `runFill` returns. A joiner that cancels only
+cancels itself (it `select`s between its context and
+`f.done`).
+
+If the leader's 5-minute context fires, the fill fails for the
+joiners too. Worst case Orca wasted one fill's worth of work,
+and the next request triggers a fresh one.
+
+### 7.7 Failure handling without re-stampede
+
+How each kind of failure is handled:
+
+- **Retryable origin errors during pre-header retry.** The
+  leader retries up to `origin.retry.attempts` (default 3)
+  within `origin.retry.max_total_duration` (default 5s), with
+  exponential backoff (`origin.retry.backoff_initial=100ms`,
+  `origin.retry.backoff_max=2s`). All this happens before any
+  HTTP header is sent, so the client never sees the transient
+  failure. If the budget runs out, the client gets a 502
+  `OriginUnreachable`.
+- **`OriginETagChangedError`.** Not retried. The leader
+  invalidates the metadata cache entry for
+  `(origin_id, bucket, key)` and returns the error. The next
+  request re-`Head`s, sees the new ETag, builds a new
+  `ChunkKey`, and refills under the new path.
+- **`origin.ErrNotFound`.** Not retried. Cached negatively for
+  `metadata.negative_ttl`. The client gets a 404.
+- **`UnsupportedBlobTypeError` / `MissingETagError`.** Not
+  retried. Cached negatively. The client gets a 502.
+- **Short body from the origin.** Hard error. `runFill` rejects
+  a body that doesn't match `ExpectedLen(objectSize)`. The fill
+  fails, the joiners see the error, and the catalog is not
+  updated. This is what stops a short fetch from poisoning the
+  catalog.
+- **Commit failure after the response is gone**
+  (`PutChunk` returns something other than `nil` or
+  `ErrCommitLost`). The client already has the bytes, so the
+  failure is invisible to them. The chunk is not recorded; the
+  next request will refill. A sustained rate of this is a
+  cachestore-health problem; today it's only visible in the
+  structured debug logs.
+- **CacheStore `ErrTransient` / `ErrAuth` during a read.** The
+  client gets a 502. Orca does not auto-refill, because that
+  would just hammer a backend that's already struggling.
+
+## 8. Atomic commit
+
+The leader publishes a chunk to the cachestore in one step that
+won't overwrite anything: `PutObject` with `If-None-Match: *`.
+The second concurrent commit for the same key gets HTTP 412 and
+is recorded as `ErrCommitLost`. So when two replicas race to
+fill the same chunk, exactly one wins; the loser treats the
+existing object as the truth.
+
+Joiners don't wait for the commit
+([s7.2](#72-singleflight--commit-after-serve)). They're released
+as soon as the leader's buffer is full and length-checked. The
+`PutChunk` RPC runs in parallel with the joiners' reads. If the
+commit fails, the client never knows; Orca just doesn't record
+the chunk, and the next request refills.
+
+**Boot-time self-test (`SelfTestAtomicCommit`).** At startup the
+`cachestore/s3` driver writes a probe key, then writes the same
+probe key again with `If-None-Match: "*"` and expects a 412. If
+the second write returns 200 (the backend silently overwrote),
+the driver refuses to start. This catches backends that don't
+implement the precondition. Verified backends today: AWS S3
+(since 2024-08), MinIO, VAST Cluster (only on non-versioned
+buckets).
+
+**Boot-time versioning gate.** The driver also runs
+`GetBucketVersioning(bucket)`. If versioning is `Enabled` or
+`Suspended`, startup fails with a clear error. VAST and several
+S3-compatible backends ignore `If-None-Match: *` on versioned
+buckets, which would silently break the atomic-commit rule.
+
+## 9. Bounded staleness contract
+
+Orca relies on a promise from the operator. It also caps the
+damage if the operator breaks the promise.
+
+### 9.1 The contract and the staleness window
+
+**The contract.** For any `(origin_id, bucket, object_key)`, the
+bytes never change once published. To change the data, publish
+a new key. Overwriting in place is breaking the promise.
+
+**Why this is enough.** The chunk's storage path includes its
+ETag (s5). New ETag, new path. So as long as operators publish
+new bytes under new keys, Orca cannot serve old bytes for a new
+key.
+
+**What happens if the promise is broken.** For up to 5 minutes
+(the default `metadata.ttl`), Orca may serve the old bytes.
+Here's why:
+
+- Object metadata (`size`, `etag`, `content_type`) is cached for
+  `metadata.ttl` so Orca doesn't re-`HEAD` on every request.
+- During that window, every request looks up the cached ETag,
+  builds the old `ChunkKey`, and serves from the old chunks.
+- When the window expires, the next request does a fresh `Head`,
+  sees the new ETag, builds a new `ChunkKey`, and refills.
+
+**Why this is OK for the target workload.** Orca is built for
+large immutable artifacts (job inputs, model weights, training
+shards). Those naturally fit the contract. The 5-minute window
+is the worst case, not the normal case. A new key gets the right
+ETag right away.
+
+**Safety net.** Every `Origin.GetRange` sends `If-Match: <etag>`.
+If an in-flight fetch races with an in-place overwrite, the
+origin returns 412 `PreconditionFailed`. The leader fails the
+fill and invalidates the metadata cache entry. This catches the
+narrow case where a violation happens between the `Head` and the
+`GetRange`. It does **not** catch a violation between two
+separate request lifecycles inside the same `metadata.ttl`
+window. The `metadata.ttl` cap is what bounds that case.
+
+## 10. Create-after-404 and negative-cache lifecycle
+
+### 10.1 The scenario
+
+The "I forgot to upload that" case. A client asks for key `K`.
+The origin doesn't have it yet. Orca caches the 404 and returns
+it. Then the operator uploads `K`. Orca keeps returning 404
+until the cached 404 expires.
+
+From the client's view, this looks the same as the operator
+breaking the no-overwrite rule (s9): the bytes for `K` changed
+without Orca knowing. There is no origin-to-cache invalidation,
+so all Orca can do is cap how long it serves the stale 404.
+
+### 10.2 Asymmetric TTLs
+
+The metadata cache uses two TTLs:
+
+| TTL | Default | Bounds | Why |
+|---|---|---|---|
+| `metadata.ttl` | 5m | how long Orca trusts a `200 + ETag` without re-`HEAD`ing | the contract holds in normal use, so trusting it longer cuts origin HEAD load |
+| `metadata.negative_ttl` | 60s | how long Orca trusts a `404`, `UnsupportedBlobTypeError`, or `MissingETagError` | operators do upload keys that someone already tried to fetch, so recovery should be quick |
+
+The two timeouts are different on purpose. The 5-minute timeout
+only matters if the operator breaks the no-overwrite rule. The
+60-second timeout matters every time someone uploads a key that
+a client already saw a 404 on - a normal thing that happens.
+
+The per-replica HEAD singleflight (s7.5) keeps the short
+negative TTL from creating HEAD storms. A flood of distinct
+missing keys produces at most one HEAD per object per replica
+per `metadata.negative_ttl`. At defaults (60s, 3 replicas) the
+origin sees at most 3 HEADs per missing key per minute, well
+under any documented S3 / Azure rate limit.
+
+### 10.3 Worst-case unavailability window
+
+After an operator uploads a key that someone already tried to
+fetch:
+
+- A replica that saw the original 404 keeps serving 404 for up
+  to `metadata.negative_ttl` from when **it** saw the 404, not
+  from when the upload happened. Orca has no way to know when
+  the upload happened.
+- A replica that did not see the 404 will `Head` fresh on the
+  first request and serve 200 right away.
+- Worst case across the cluster: `metadata.negative_ttl` after
+  the last replica's original 404. Under round-robin load
+  balancing, clients can see 404 and 200 alternating during the
+  drain.
+
+There is no way to actively invalidate (no origin push, no
+admin RPC). The workaround: after an upload, wait
+`metadata.negative_ttl` before telling anyone the key exists.
+
+### Diagram 6: Scenario G - create-after-404 timeline
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant Op as Operator
+    participant C as Client
+    participant A as Replica A
+    participant B as Replica B
+    participant O as Origin
+    Note over A,B: t=0  K not yet uploaded
+    C->>A: GET /bucket/K
+    A->>O: Head(K)
+    O-->>A: 404
+    Note over A: cache K -> 404<br/>TTL = metadata.negative_ttl (60s)
+    A-->>C: 404
+    Note over Op,O: t=30s  operator uploads K
+    Op->>O: PUT /bucket/K
+    Note over A,B: t=45s  drain window
+    C->>B: GET /bucket/K (LB routes to B)
+    B->>O: Head(K)
+    O-->>B: 200 + ETag
+    B->>O: GetRange (fill path)
+    O-->>B: bytes
+    B-->>C: 200 + bytes
+    Note over A,B: inconsistent results across replicas during drain
+    C->>A: GET /bucket/K (LB routes to A again)
+    Note over A: negative entry still valid<br/>age 45s less than 60s
+    A-->>C: 404 STALE
+    Note over A: t=60s+  negative entry expires
+    C->>A: GET /bucket/K (t=70s)
+    A->>O: Head(K)
+    O-->>A: 200 + ETag
+    A->>O: GetRange (fill path)
+    O-->>A: bytes
+    A-->>C: 200 + bytes
+    Note over A,B: drain complete - replicas consistent
+```
+
+A timeline of the drain. Replica A saw the 404; replica B did
+not. During the window between the upload and the cache expiry,
+clients can get a 200 from B and a 404 from A on the same key.
+
+## 11. Eviction and capacity
+
+### 11.1 Passive eviction (lifecycle)
+
+Eviction is the cachestore's job, not Orca's. The recommended
+setup is age-based expiration on the chunk prefix, with the
+expiry chosen to fit the working set in the available capacity.
+Storage paths start with `origin_id`, so an operator can set a
+different lifecycle for each deployment that shares a bucket.
+
+For AWS S3, MinIO, and VAST, the bucket lifecycle policy handles
+this. Configure it on the bucket.
+
+The `cachestore.CacheStore` interface has a `Delete(k)` method,
+but production code doesn't call it. The method is there so a
+future active-eviction loop can use it; see
+[Deferred / future work](#13-deferred--future-work).
+
+### 11.2 ChunkCatalog size
+
+The catalog is capped by `chunk_catalog.max_entries` (default
+100,000). Each entry is roughly 80 bytes (the path string plus a
+list pointer), so the default is about 8 MB per replica.
+Operators with very large active working sets should size the
+catalog to a multiple of the expected chunk count (working set /
+chunk size).
+
+A catalog smaller than the working set is still correct, just
+slower: cold lookups fall through to `CacheStore.Stat`. The
+cachestore is always the truth.
+
+### 11.3 `chunk_size` config-change capacity impact
+
+Changing `chunk_size` orphans the old chunks (s5). Storage
+roughly doubles for a while as the working set rebuilds at the
+new size. The bucket lifecycle policy ages the orphaned chunks
+out.
+
+### 11.4 Per-fill memory
+
+Peak memory per fill is one chunk, at whatever size the tier
+ladder picked for that object. With the default ladder, that's
+8 MiB for small objects, up to 128 MiB for objects over 10 GiB.
+
+The per-replica origin cap is
+`floor(target_global / target_replicas)`. On a 4-replica cluster
+with `target_global = 64`, that's 16 concurrent fills.
+
+So the worst case per replica is `16 fills * 128 MiB = 2 GiB` of
+in-flight chunk buffers when many large objects are being filled
+at the same time.
+
+Operators with tighter memory budgets should remove the top tier
+or lower its chunk size. Read-ahead does not change this number
+- the cap on cold fills is what bounds memory.
+
+## 12. Horizontal scale
+
+Cluster membership comes from the headless Service. A DNS
+A-record lookup returns the IPs of all Ready pods. The cluster
+package polls that list every `cluster.membership_refresh`
+(default 5s), and the hash on chunk identity picks a coordinator
+per chunk. The assembler reads from the cachestore on a hit,
+runs the local singleflight if it's the coordinator, or calls
+`GET /internal/fill?<chunk-key params>` otherwise.
+
+Pod names are not stable under a Deployment. Orca addresses
+peers only by IP, not by name.
+
+The cachestore stores one copy of each chunk. If a chunk is lost,
+Orca refills from the origin. Every replica can read every
+chunk; no replica owns any bytes, so losing a replica never
+strands data.
+
+**What happens if the peer set is empty.** If `Cluster.Peers()`
+comes back empty - the Service has no Ready endpoints, DNS
+returns NXDOMAIN, or CoreDNS is broken - the replica treats
+itself as the only peer. The hash picks self for every chunk and
+every fill runs locally. Orca keeps serving; the only loss is
+that cluster-wide dedup falls back to per-replica dedup until
+DNS recovers. No process restart is needed.
+
+**What happens when a refresh fails.** On a DNS error or peer-
+source error, the cluster keeps the previous (non-empty) peer
+list rather than wiping it to `[Self]`. After 5 failures in a
+row (`maxStalePeerRefreshes`) it falls back to `[Self]`. That
+bounds how long Orca routes to dead peers. A `context.Canceled`
+during graceful shutdown doesn't count toward the streak.
+
+**`/readyz` predicate.** `/readyz` only flips to 200 after at
+least one successful peer-set snapshot. So if DNS is broken end
+to end the replica stays `NotReady` and gets drained, even
+though the empty-peer fallback would otherwise let it serve.
+
+**Rolling restarts.** Pod IPs change during a rolling restart,
+and the new IPs take up to `cluster.membership_refresh` to
+propagate. During that window the assembler and the new replica
+can disagree on who owns a chunk. The assembler routes to a
+stale IP and either gets `connection refused` (and falls back to
+filling locally) or reaches the wrong replica (which returns 409
+`not_coordinator`, and the assembler falls back). Either way,
+the loser of the resulting commit race gets `ErrCommitLost`. No
+duplicate bytes are written.
+
+### Diagram 7: Membership flux during rolling restart
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant A as Replica A
+    participant DNS as headless Service DNS
+    participant B as Replica B (old IP)
+    participant Bp as Replica B' (new IP)
+    participant CS as CacheStore
+    Note over A,B: t=0  peers (A's view) = {A, B}<br/>chunk k owned by B
+    A->>DNS: refresh
+    DNS-->>A: [ip(A), ip(B)]
+    Note over B,Bp: t=5s  rolling restart: B terminates,<br/>B' starts with a new IP
+    Note over A: A's cached membership still {A, B}<br/>until next refresh
+    A->>A: rendezvous(k, {A,B}) = B (stale)
+    A->>B: /internal/fill (connection refused)
+    A->>A: fallback: fill locally
+    A->>CS: PutChunk(If-None-Match: *)
+    Note over Bp: B' bootstraps, refreshes DNS<br/>peers (B's view) = {A, B'}
+    Bp->>Bp: rendezvous(k, {A,B'}) = B'
+    Bp->>CS: PutChunk(If-None-Match: *)
+    CS-->>A: 200 commit_won
+    CS-->>Bp: 412 commit_lost (ErrCommitLost)
+    Note over A,Bp: at-most-one duplicate fill per chunk
+    Note over A,DNS: t=10s  A refreshes DNS<br/>peers converge to {A, B'}<br/>steady state restored
+```
+
+A walks through B being replaced by B'. A still thinks B owns
+chunk k, tries B's old IP, fails, and fills locally. Meanwhile
+B' boots, decides it owns k, and fills too. Both write to the
+cachestore. The atomic-commit rule means only one write sticks;
+the other gets `ErrCommitLost`. No corruption.
+
+## 13. Deferred / future work
+
+Things considered and not built. None requires breaking
+existing interfaces. Build each when there's measured evidence
+that justifies the extra surface area.
+
+### Auth enforcement on edge and internal listeners
+
+The edge handler checks `cfg.Server.Auth.Enabled` and returns
+401 if it's true, but nothing actually checks bearer tokens or
+mTLS client certs. The internal listener takes plain HTTP/2 in
+dev; the `cluster.internal_tls.*` config keys are read but
+nothing does the TLS handshake. Production deployments rely on
+Kubernetes NetworkPolicy (or equivalent network isolation)
+today.
+
+Building this means: a real bearer-token check (HMAC against a
+Kubernetes Secret), mTLS plumbing on both listeners with
+separate trust roots, and a peer-IP check on the internal
+listener.
+
+### Posix-shared cachestore drivers
+
+`cachestore/posixfs` (shared POSIX filesystems: NFSv4.1+, Weka
+native, CephFS, Lustre, GPFS) and `cachestore/localfs` (dev)
+were designed and not built. The atomic-commit primitive there
+is `link()` returning `EEXIST` (or
+`renameat2(RENAME_NOREPLACE)`). The posixfs flavor adds backend
+detection, an NFS minimum-version check, refusal on Alluxio
+FUSE, and a 2-character hex path fan-out. Both would share
+helpers via `internal/orca/cachestore/internal/posixcommon/`.
+
+These would let Orca run against shared-filesystem deployments
+that don't have an in-DC S3-compatible object store. The
+`SelfTestAtomicCommit` hook on `CacheStore` is already shaped to
+absorb them.
+
+### Prometheus metrics
+
+There are no Prometheus collectors yet. The diagnostic surface
+today is structured `slog` output (debug-level traces through
+every chunk-resolution decision, switchable via
+`logging.level` or `ORCA_LOG_LEVEL`).
+
+The metric families that would matter:
+- `orca_origin_*` (HEAD / GetRange counts, retry outcomes,
+  duplicate fills, ETag-changed).
+- `orca_cachestore_*` (put / get / stat counts, commit
+  outcomes).
+- `orca_commit_after_serve_total{ok|failed}`.
+- `orca_origin_inflight` (per-replica origin semaphore gauge).
+- `orca_fills_inflight` (per-replica singleflight map size).
+- `orca_cluster_*` (peer-set size, refresh outcomes, internal-
+  fill duration, direction, 409 rate).
+- `orca_metadata_*` (positive / negative counts and ages).
+- `orca_chunk_catalog_hit_rate`.
+
+A Grafana dashboard is part of the work.
+
+### CacheStore circuit breaker
+
+A per-process circuit breaker around cachestore calls. Sustained
+`ErrTransient` or `ErrAuth` would short-circuit writes so Orca
+doesn't keep hammering a backend that's already in trouble.
+Defaults considered: 10 errors per 30s window, 30s open, 3
+half-open probes. It would also flip `/readyz` to `NotReady` on
+sustained `ErrAuth`, and gate any future active-eviction loop's
+`Delete` calls.
+
+### LIST cache and cluster-wide LIST coordinator
+
+The LIST handler is a pass-through today. A per-replica LIST
+cache keyed on
+`(origin_id, bucket, prefix, continuation_token, start_after, delimiter, max_keys)`
+would absorb FUSE `ls` workloads (`list_cache.ttl=60s` default,
+`list_cache.max_entries=1024`). A cluster-wide LIST coordinator
+on the same query tuple is the next step. Both need
+`409`-fallback semantics like the chunk-fill coordinator.
+
+### Active eviction loop
+
+An opt-in background loop
+(`chunk_catalog.active_eviction.enabled`) that uses
+access-frequency tracking on the catalog to `CacheStore.Delete`
+cold chunks. Requires extending the catalog to record
+`AccessCount`, `LastAccessed`, and `LastEntered` per entry. The
+`Delete` method on `CacheStore` exists for this. Useful for
+posixfs deployments that don't have external sweep tooling.
+
+### Bounded-freshness mode
+
+An opt-in (`metadata_refresh.enabled`) per-replica background
+loop that re-`Head`s hot keys before `metadata.ttl` expires.
+That shrinks the effective staleness window for popular keys
+from `metadata.ttl` to `refresh_ahead_ratio * metadata.ttl`
+(e.g. 3.5 minutes). Hot-key detection uses access counters on
+the metadata cache.
+
+### Cluster-wide HEAD singleflight
+
+A second coordinator role (`Cluster.HeadCoordinator(ObjectKey)`)
+alongside the chunk-fill coordinator. With it, the cluster does
+exactly one `Origin.Head` per object per `metadata.ttl` window
+instead of N. Only justified at much larger peer-set sizes than
+the documented 3-5 replicas.
+
+### Coordinated cluster-wide origin limiter
+
+A Kubernetes-Lease-elected authority that hands out slot-lease
+tokens to peers, replacing the per-replica static cap with a
+true cluster-wide cap on `Origin.GetRange` calls. Lots of moving
+parts (election, slot-lease tokens, batching, fallback mode,
+RBAC). Only worth it when the peer set grows past 10-ish and
+individual replicas show sustained slot under-utilization.
+
+### Dynamic per-replica origin cap
+
+Compute `target_per_replica` at runtime from
+`len(Cluster.Peers())` instead of from the static
+`cluster.target_replicas` config knob. Helpful for HPA-driven
+autoscaling, or when operators routinely change replica count
+and forget to update the config.
+
+### Mid-stream origin resume
+
+Today, if the origin disconnects after Orca has sent any bytes
+to the client, the response just ends; S3 SDKs retry from
+scratch. A resume path would re-issue `Origin.GetRange` with
+`Range: bytes=<offset>-` and keep feeding the client invisibly.
+Trade-off: real state-tracking work, plus interaction with the
+singleflight joiners. SDK retry already handles this case.
+
+### Per-request correlation IDs
+
+Threading a request-scoped logger through every fetch coordinator
+method needs ctx propagation in a lot of places. The shared
+`slog.Group("chunk", ...)` taxonomy plus `AddSource: true`
+already give cross-package correlation by chunk identity.
+
+### Orphan-chunk garbage collection
+
+When an origin ETag rotates, the old chunks under
+`<origin_id>/<old-hash>/...` stay in the cachestore until the
+bucket lifecycle policy deletes them. The atomic-commit rule
+means there's no corruption; the only cost is storage growth in
+proportion to the rotation rate. A real GC would walk the
+cachestore and remove chunks whose
+`(origin_id, bucket, key, etag)` no longer matches the current
+origin `Head`. That's a lot of code for a problem that
+lifecycle policies already handle in production.
+
+### Singleflight context propagation
+
+If the leader's request context cancels, the joiners get the
+leader's error rather than continuing to wait on the fill (which
+is on its own 5-minute context anyway). Self-healing on the
+next request. Fixing this means restructuring the singleflight
+join to outlive the leader's caller; a lot of work for a small
+TTFB win.
+
+### Origin-semaphore starvation under cancellation storms
+
+A flood of cancelled requests can briefly hold origin slots
+between acquire and the deferred release. Operational concern
+only; no observed incident. Need metrics first.
diff --git a/go.mod b/go.mod
index 9fdc87a3..49794bf4 100644
--- a/go.mod
+++ b/go.mod
@@ -26,6 +26,11 @@ require (
 	github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.6.4
 	github.com/Masterminds/semver/v3 v3.4.0
 	github.com/Masterminds/sprig/v3 v3.3.0
+	github.com/aws/aws-sdk-go-v2 v1.41.7
+	github.com/aws/aws-sdk-go-v2/config v1.32.17
+	github.com/aws/aws-sdk-go-v2/credentials v1.19.16
+	github.com/aws/aws-sdk-go-v2/service/s3 v1.101.0
+	github.com/aws/smithy-go v1.25.1
 	github.com/bougou/go-ipmi v0.8.3
 	github.com/cilium/ebpf v0.21.0
 	github.com/coder/websocket v1.8.14
@@ -49,6 +54,7 @@ require (
 	github.com/spf13/cobra v1.10.2
 	github.com/spf13/pflag v1.0.10
 	github.com/stretchr/testify v1.11.1
+	github.com/testcontainers/testcontainers-go v0.42.0
 	github.com/vishvananda/netlink v1.3.1
 	golang.org/x/crypto v0.50.0
 	golang.org/x/mod v0.35.0
@@ -73,27 +79,51 @@ require (
 )
 
 require (
-	dario.cat/mergo v1.0.1 // indirect
-	github.com/AdaLogics/go-fuzz-headers v0.0.0-20230106234847-43070de90fa1 // indirect
+	dario.cat/mergo v1.0.2 // indirect
+	github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 // indirect
 	github.com/Azure/azure-sdk-for-go/sdk/internal v1.12.0 // indirect
-	github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect
+	github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect
 	github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 // indirect
 	github.com/Masterminds/goutils v1.1.1 // indirect
+	github.com/Microsoft/go-winio v0.6.2 // indirect
 	github.com/apex/log v1.9.0 // indirect
+	github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.10 // indirect
+	github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.23 // indirect
+	github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.23 // indirect
+	github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.23 // indirect
+	github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.24 // indirect
+	github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.9 // indirect
+	github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.15 // indirect
+	github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.23 // indirect
+	github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.23 // indirect
+	github.com/aws/aws-sdk-go-v2/service/signin v1.0.11 // indirect
+	github.com/aws/aws-sdk-go-v2/service/sso v1.30.17 // indirect
+	github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.21 // indirect
+	github.com/aws/aws-sdk-go-v2/service/sts v1.42.1 // indirect
 	github.com/beorn7/perks v1.0.1 // indirect
 	github.com/blang/semver/v4 v4.0.0 // indirect
+	github.com/cenkalti/backoff/v4 v4.3.0 // indirect
 	github.com/cespare/xxhash/v2 v2.3.0 // indirect
+	github.com/containerd/errdefs v1.0.0 // indirect
+	github.com/containerd/errdefs/pkg v0.3.0 // indirect
 	github.com/containerd/log v0.1.0 // indirect
+	github.com/cpuguy83/dockercfg v0.3.2 // indirect
 	github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect
 	github.com/cyphar/filepath-securejoin v0.5.0 // indirect
 	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
+	github.com/distribution/reference v0.6.0 // indirect
+	github.com/docker/go-connections v0.6.0 // indirect
 	github.com/docker/go-units v0.5.0 // indirect
 	github.com/dustin/go-humanize v1.0.1 // indirect
+	github.com/ebitengine/purego v0.10.0 // indirect
 	github.com/emicklei/go-restful/v3 v3.12.2 // indirect
 	github.com/evanphx/json-patch/v5 v5.9.11 // indirect
+	github.com/felixge/httpsnoop v1.0.4 // indirect
 	github.com/fxamacker/cbor/v2 v2.9.0 // indirect
 	github.com/go-errors/errors v1.4.2 // indirect
+	github.com/go-logr/stdr v1.2.2 // indirect
 	github.com/go-logr/zapr v1.3.0 // indirect
+	github.com/go-ole/go-ole v1.2.6 // indirect
 	github.com/go-openapi/jsonpointer v0.21.0 // indirect
 	github.com/go-openapi/jsonreference v0.20.2 // indirect
 	github.com/go-openapi/swag v0.23.0 // indirect
@@ -110,12 +140,14 @@ require (
 	github.com/josharian/intern v1.0.0 // indirect
 	github.com/josharian/native v1.1.0 // indirect
 	github.com/json-iterator/go v1.1.12 // indirect
-	github.com/klauspost/compress v1.18.0 // indirect
+	github.com/klauspost/compress v1.18.5 // indirect
 	github.com/klauspost/pgzip v1.2.6 // indirect
 	github.com/kr/pretty v0.3.1 // indirect
 	github.com/kr/text v0.2.0 // indirect
 	github.com/kylelemons/godebug v1.1.0 // indirect
 	github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect
+	github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
+	github.com/magiconair/properties v1.8.10 // indirect
 	github.com/mailru/easyjson v0.7.7 // indirect
 	github.com/mattn/go-colorable v0.1.14 // indirect
 	github.com/mattn/go-isatty v0.0.20 // indirect
@@ -125,10 +157,16 @@ require (
 	github.com/mdlayher/socket v0.5.1 // indirect
 	github.com/mitchellh/copystructure v1.2.0 // indirect
 	github.com/mitchellh/reflectwalk v1.0.2 // indirect
+	github.com/moby/docker-image-spec v1.3.1 // indirect
+	github.com/moby/go-archive v0.2.0 // indirect
+	github.com/moby/moby/api v1.54.1 // indirect
+	github.com/moby/moby/client v0.4.0 // indirect
+	github.com/moby/patternmatcher v0.6.1 // indirect
 	github.com/moby/spdystream v0.5.1 // indirect
+	github.com/moby/sys/sequential v0.6.0 // indirect
 	github.com/moby/sys/user v0.4.0 // indirect
 	github.com/moby/sys/userns v0.1.0 // indirect
-	github.com/moby/term v0.5.0 // indirect
+	github.com/moby/term v0.5.2 // indirect
 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
 	github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
 	github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect
@@ -145,6 +183,7 @@ require (
 	github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
+	github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect
 	github.com/prometheus/client_model v0.6.2 // indirect
 	github.com/prometheus/common v0.66.1 // indirect
 	github.com/prometheus/procfs v0.16.1 // indirect
@@ -153,10 +192,13 @@ require (
 	github.com/rogpeppe/go-internal v1.14.1 // indirect
 	github.com/rootless-containers/proto/go-proto v0.0.0-20230421021042-4cd87ebadd67 // indirect
 	github.com/russross/blackfriday/v2 v2.1.0 // indirect
+	github.com/shirou/gopsutil/v4 v4.26.3 // indirect
 	github.com/shopspring/decimal v1.4.0 // indirect
-	github.com/sirupsen/logrus v1.9.3 // indirect
+	github.com/sirupsen/logrus v1.9.4 // indirect
 	github.com/sony/gobreaker/v2 v2.4.0 // indirect
 	github.com/spf13/cast v1.7.0 // indirect
+	github.com/tklauser/go-sysconf v0.3.16 // indirect
+	github.com/tklauser/numcpus v0.11.0 // indirect
 	github.com/u-root/uio v0.0.0-20230220225925-ffce2a382923 // indirect
 	github.com/urfave/cli v1.22.12 // indirect
 	github.com/vbatts/go-mtree v0.6.1-0.20250911112631-8307d76bc1b9 // indirect
@@ -164,6 +206,12 @@ require (
 	github.com/x448/float16 v0.8.4 // indirect
 	github.com/xlab/treeprint v1.2.0 // indirect
 	github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect
+	github.com/yusufpapurcu/wmi v1.2.4 // indirect
+	go.opentelemetry.io/auto/sdk v1.2.1 // indirect
+	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect
+	go.opentelemetry.io/otel v1.41.0 // indirect
+	go.opentelemetry.io/otel/metric v1.41.0 // indirect
+	go.opentelemetry.io/otel/trace v1.41.0 // indirect
 	go.uber.org/multierr v1.11.0 // indirect
 	go.uber.org/zap v1.27.0 // indirect
 	go.yaml.in/yaml/v2 v2.4.3 // indirect
@@ -172,7 +220,7 @@ require (
 	golang.org/x/oauth2 v0.34.0 // indirect
 	golang.org/x/telemetry v0.0.0-20260409153401-be6f6cb8b1fa // indirect
 	golang.org/x/text v0.36.0 // indirect
-	golang.org/x/time v0.9.0 // indirect
+	golang.org/x/time v0.11.0 // indirect
 	golang.org/x/tools v0.44.0 // indirect
 	golang.org/x/vuln v1.2.0 // indirect
 	golang.zx2c4.com/wireguard v0.0.0-20231211153847-12269c276173 // indirect
diff --git a/go.sum b/go.sum
index 91bab086..3cf29662 100644
--- a/go.sum
+++ b/go.sum
@@ -1,9 +1,9 @@
 cel.dev/expr v0.25.1 h1:1KrZg61W6TWSxuNZ37Xy49ps13NUovb66QLprthtwi4=
 cel.dev/expr v0.25.1/go.mod h1:hrXvqGP6G6gyx8UAHSHJ5RGk//1Oj5nXQ2NI02Nrsg4=
-dario.cat/mergo v1.0.1 h1:Ra4+bf83h2ztPIQYNP99R6m+Y7KfnARDfID+a+vLl4s=
-dario.cat/mergo v1.0.1/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk=
-github.com/AdaLogics/go-fuzz-headers v0.0.0-20230106234847-43070de90fa1 h1:EKPd1INOIyr5hWOWhvpmQpY6tKjeG0hT1s3AMC/9fic=
-github.com/AdaLogics/go-fuzz-headers v0.0.0-20230106234847-43070de90fa1/go.mod h1:VzwV+t+dZ9j/H867F1M2ziD+yLHtB46oM35FxxMJ4d0=
+dario.cat/mergo v1.0.2 h1:85+piFYR1tMbRrLcDwR18y4UKJ3aH1Tbzi24VRW1TK8=
+dario.cat/mergo v1.0.2/go.mod h1:E/hbnu0NxMFBjpMIE34DRGLWqDy0g5FuKDhCb31ngxA=
+github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 h1:He8afgbRMd7mFxO99hRNu+6tazq8nFF9lIwo9JFroBk=
+github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6/go.mod h1:8o94RPi1/7XTJvwPpRSzSUedZrtlirdB3r9Z20bi2f8=
 github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.1 h1:jHb/wfvRikGdxMXYV3QG/SzUOPYN9KEUUuC0Yd0/vC0=
 github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.1/go.mod h1:pzBXCYn05zvYIrwLgtK8Ap8QcjRg+0i76tMQdWN6wOk=
 github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 h1:Hk5QBxZQC1jb2Fwj6mpzme37xbCDdNTxU7O9eb5+LB4=
@@ -46,8 +46,8 @@ github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/storage/armstorage v1.8.1
 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/storage/armstorage v1.8.1/go.mod h1:Ng3urmn6dYe8gnbCMoHHVl5APYz2txho3koEkV2o2HA=
 github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.6.4 h1:jWQK1GI+LeGGUKBADtcH2rRqPxYB1Ljwms5gFA2LqrM=
 github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.6.4/go.mod h1:8mwH4klAm9DUgR2EEHyEEAQlRDvLPyg5fQry3y+cDew=
-github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0=
-github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E=
+github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c h1:udKWzYgxTojEKWjV8V+WSxDXJ4NFATAsZjh8iIbsQIg=
+github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E=
 github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1 h1:WJTmL004Abzc5wDB5VtZG2PJk5ndYDgVacGqfirKxjM=
 github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1/go.mod h1:tCcJZ0uHAmvjsVYzEFivsRTN00oz5BEsRgQHu5JZ9WE=
 github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 h1:XRzhVemXdgvJqCH0sFfrBUTnUJSBrBf7++ypk+twtRs=
@@ -59,6 +59,8 @@ github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1
 github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM=
 github.com/Masterminds/sprig/v3 v3.3.0 h1:mQh0Yrg1XPo6vjYXgtf5OtijNAKJRNcTdOOGZe3tPhs=
 github.com/Masterminds/sprig/v3 v3.3.0/go.mod h1:Zy1iXRYNqNLUolqCpL4uhk6SHUMAOSCzdgBfDb35Lz0=
+github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=
+github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
 github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI=
 github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g=
 github.com/apex/log v1.9.0 h1:FHtw/xuaM8AgmvDDTI9fiwoAL25Sq2cxojnZICUU8l0=
@@ -69,6 +71,42 @@ github.com/aphistic/sweet v0.2.0/go.mod h1:fWDlIh/isSE9n6EPsRmC0det+whmX6dJid3st
 github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
 github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs=
 github.com/aws/aws-sdk-go v1.20.6/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo=
+github.com/aws/aws-sdk-go-v2 v1.41.7 h1:DWpAJt66FmnnaRIOT/8ASTucrvuDPZASqhhLey6tLY8=
+github.com/aws/aws-sdk-go-v2 v1.41.7/go.mod h1:4LAfZOPHNVNQEckOACQx60Y8pSRjIkNZQz1w92xpMJc=
+github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.10 h1:gx1AwW1Iyk9Z9dD9F4akX5gnN3QZwUB20GGKH/I+Rho=
+github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.10/go.mod h1:qqY157uZoqm5OXq/amuaBJyC9hgBCBQnsaWnPe905GY=
+github.com/aws/aws-sdk-go-v2/config v1.32.17 h1:FpL4/758/diKwqbytU0prpuiu60fgXKUWCpDJtApclU=
+github.com/aws/aws-sdk-go-v2/config v1.32.17/go.mod h1:OXqUMzgXytfoF9JaKkhrOYsyh72t9G+MJH8mMRaexOE=
+github.com/aws/aws-sdk-go-v2/credentials v1.19.16 h1:r3RJBuU7X9ibt8RHbMjWE6y60QbKBiII6wSrXnapxSU=
+github.com/aws/aws-sdk-go-v2/credentials v1.19.16/go.mod h1:6cx7zqDENJDbBIIWX6P8s0h6hqHC8Avbjh9Dseo27ug=
+github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.23 h1:UuSfcORqNSz/ey3VPRS8TcVH2Ikf0/sC+Hdj400QI6U=
+github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.23/go.mod h1:+G/OSGiOFnSOkYloKj/9M35s74LgVAdJBSD5lsFfqKg=
+github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.23 h1:GpT/TrnBYuE5gan2cZbTtvP+JlHsutdmlV2YfEyNde0=
+github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.23/go.mod h1:xYWD6BS9ywC5bS3sz9Xh04whO/hzK2plt2Zkyrp4JuA=
+github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.23 h1:bpd8vxhlQi2r1hiueOw02f/duEPTMK59Q4QMAoTTtTo=
+github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.23/go.mod h1:15DfR2nw+CRHIk0tqNyifu3G1YdAOy68RftkhMDDwYk=
+github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.24 h1:OQqn11BtaYv1WLUowvcA30MpzIu8Ti4pcLPIIyoKZrA=
+github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.24/go.mod h1:X5ZJyfwVrWA96GzPmUCWFQaEARPR7gCrpq2E92PJwAE=
+github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.9 h1:FLudkZLt5ci0ozzgkVo8BJGwvqNaZbTWb3UcucAateA=
+github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.9/go.mod h1:w7wZ/s9qK7c8g4al+UyoF1Sp/Z45UwMGcqIzLWVQHWk=
+github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.15 h1:ieLCO1JxUWuxTZ1cRd0GAaeX7O6cIxnwk7tc1LsQhC4=
+github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.15/go.mod h1:e3IzZvQ3kAWNykvE0Tr0RDZCMFInMvhku3qNpcIQXhM=
+github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.23 h1:pbrxO/kuIwgEsOPLkaHu0O+m4fNgLU8B3vxQ+72jTPw=
+github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.23/go.mod h1:/CMNUqoj46HpS3MNRDEDIwcgEnrtZlKRaHNaHxIFpNA=
+github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.23 h1:03xatSQO4+AM1lTAbnRg5OK528EUg744nW7F73U8DKw=
+github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.23/go.mod h1:M8l3mwgx5ToK7wot2sBBce/ojzgnPzZXUV445gTSyE8=
+github.com/aws/aws-sdk-go-v2/service/s3 v1.101.0 h1:etqBTKY581iwLL/H/S2sVgk3C9lAsTJFeXWFDsDcWOU=
+github.com/aws/aws-sdk-go-v2/service/s3 v1.101.0/go.mod h1:L2dcoOgS2VSgbPLvpak2NyUPsO1TBN7M45Z4H7DlRc4=
+github.com/aws/aws-sdk-go-v2/service/signin v1.0.11 h1:TdJ+HdzOBhU8+iVAOGUTU63VXopcumCOF1paFulHWZc=
+github.com/aws/aws-sdk-go-v2/service/signin v1.0.11/go.mod h1:R82ZRExE/nheo0N+T8zHPcLRTcH8MGsnR3BiVGX0TwI=
+github.com/aws/aws-sdk-go-v2/service/sso v1.30.17 h1:7byT8HUWrgoRp6sXjxtZwgOKfhss5fW6SkLBtqzgRoE=
+github.com/aws/aws-sdk-go-v2/service/sso v1.30.17/go.mod h1:xNWknVi4Ezm1vg1QsB/5EWpAJURq22uqd38U8qKvOJc=
+github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.21 h1:+1Kl1zx6bWi4X7cKi3VYh29h8BvsCoHQEQ6ST9X8w7w=
+github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.21/go.mod h1:4vIRDq+CJB2xFAXZ+YgGUTiEft7oAQlhIs71xcSeuVg=
+github.com/aws/aws-sdk-go-v2/service/sts v1.42.1 h1:F/M5Y9I3nwr2IEpshZgh1GeHpOItExNM9L1euNuh/fk=
+github.com/aws/aws-sdk-go-v2/service/sts v1.42.1/go.mod h1:mTNxImtovCOEEuD65mKW7DCsL+2gjEH+RPEAexAzAio=
+github.com/aws/smithy-go v1.25.1 h1:J8ERsGSU7d+aCmdQur5Txg6bVoYelvQJgtZehD12GkI=
+github.com/aws/smithy-go v1.25.1/go.mod h1:YE2RhdIuDbA5E5bTdciG9KrW3+TiEONeUWCqxX9i1Fc=
 github.com/aybabtme/rgbterm v0.0.0-20170906152045-cc83f3b3ce59/go.mod h1:q/89r3U2H7sSsE2t6Kca0lfwTK8JdoNGS/yzM/4iH5I=
 github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
 github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
@@ -84,29 +122,41 @@ github.com/cilium/ebpf v0.21.0 h1:4dpx1J/B/1apeTmWBH5BkVLayHTkFrMovVPnHEk+l3k=
 github.com/cilium/ebpf v0.21.0/go.mod h1:1kHKv6Kvh5a6TePP5vvvoMa1bclRyzUXELSs272fmIQ=
 github.com/coder/websocket v1.8.14 h1:9L0p0iKiNOibykf283eHkKUHHrpG7f65OE3BhhO7v9g=
 github.com/coder/websocket v1.8.14/go.mod h1:NX3SzP+inril6yawo5CQXx8+fk145lPDC6pumgx0mVg=
+github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI=
+github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M=
+github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE=
+github.com/containerd/errdefs/pkg v0.3.0/go.mod h1:NJw6s9HwNuRhnjJhM7pylWwMyAkmCQvQ4GpJHEqRLVk=
 github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I=
 github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo=
 github.com/containerd/platforms v0.2.1 h1:zvwtM3rz2YHPQsF2CHYM8+KtB5dvhISiXh5ZpSBQv6A=
 github.com/containerd/platforms v0.2.1/go.mod h1:XHCb+2/hzowdiut9rkudds9bE5yJ7npe7dG/wG+uFPw=
 github.com/coreos/go-iptables v0.8.0 h1:MPc2P89IhuVpLI7ETL/2tx3XZ61VeICZjYqDEgNsPRc=
 github.com/coreos/go-iptables v0.8.0/go.mod h1:Qe8Bv2Xik5FyTXwgIbLAnv2sWSBmvWdFETJConOQ//Q=
+github.com/cpuguy83/dockercfg v0.3.2 h1:DlJTyZGBDlXqUZ2Dk2Q3xHs/FtnooJJVaad2S9GKorA=
+github.com/cpuguy83/dockercfg v0.3.2/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHfjj5/jFyUJc=
 github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
 github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
 github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo=
 github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
 github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
-github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY=
-github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4=
+github.com/creack/pty v1.1.24 h1:bJrF4RRfyJnbTJqzRLHzcGaZK1NeM5kTC9jGgovnR1s=
+github.com/creack/pty v1.1.24/go.mod h1:08sCNb52WyoAwi2QDyzUCTgcvVFhUzewun7wtTfvcwE=
 github.com/cyphar/filepath-securejoin v0.5.0 h1:hIAhkRBMQ8nIeuVwcAoymp7MY4oherZdAxD+m0u9zaw=
 github.com/cyphar/filepath-securejoin v0.5.0/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk=
+github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E=
+github.com/docker/go-connections v0.6.0 h1:LlMG9azAe1TqfR7sO+NJttz1gy6KO7VJBh+pMmjSD94=
+github.com/docker/go-connections v0.6.0/go.mod h1:AahvXYshr6JgfUJGdDCs2b5EZG/vmaMAntpSFH5BFKE=
 github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4=
 github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
 github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
 github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
+github.com/ebitengine/purego v0.10.0 h1:QIw4xfpWT6GWTzaW5XEKy3HXoqrJGx1ijYHzTF0/ISU=
+github.com/ebitengine/purego v0.10.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ=
 github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU=
 github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
 github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k=
@@ -128,12 +178,15 @@ github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj2
 github.com/go-errors/errors v1.4.2 h1:J6MZopCL4uSllY1OfXM374weqZFFItUbrImctkmUxIA=
 github.com/go-errors/errors v1.4.2/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og=
 github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk=
+github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
 github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
 github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
 github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
 github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
 github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ=
 github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg=
+github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY=
+github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
 github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs=
 github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ=
 github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY=
@@ -166,6 +219,7 @@ github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7O
 github.com/google/go-cmdtest v0.4.1-0.20220921163831-55ab3332a786 h1:rcv+Ippz6RAtvaGgKxc+8FQIpxHgsF+HBzPyYL2cyVU=
 github.com/google/go-cmdtest v0.4.1-0.20220921163831-55ab3332a786/go.mod h1:apVn/GCasLZUVpAJ6oWAuyP7Ne7CEsQbTnc0plM3m+o=
 github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
 github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
 github.com/google/go-configfs-tsm v0.3.3-0.20240919001351-b4b5b84fdcbc h1:SG12DWUUM5igxm+//YX5Yq4vhdoRnOG9HkCodkOn+YU=
@@ -220,8 +274,8 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr
 github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
 github.com/keybase/go-keychain v0.0.1 h1:way+bWYa6lDppZoZcgMbYsvC7GxljxrskdNInRtuthU=
 github.com/keybase/go-keychain v0.0.1/go.mod h1:PdEILRW3i9D8JcdM+FmY6RwkHGnhHxXwkPPMeUgOK1k=
-github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
-github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
+github.com/klauspost/compress v1.18.5 h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBFW2IE=
+github.com/klauspost/compress v1.18.5/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ=
 github.com/klauspost/pgzip v1.2.6 h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU=
 github.com/klauspost/pgzip v1.2.6/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
 github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=
@@ -239,6 +293,10 @@ github.com/lib/pq v1.12.3 h1:tTWxr2YLKwIvK90ZXEw8GP7UFHtcbTtty8zsI+YjrfQ=
 github.com/lib/pq v1.12.3/go.mod h1:/p+8NSbOcwzAEI7wiMXFlgydTwcgTr3OSKMsD2BitpA=
 github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de h1:9TO3cAIGXtEhnIaL+V+BEER86oLrvS+kWobKpbJuye0=
 github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de/go.mod h1:zAbeS9B/r2mtpb6U+EI2rYA5OAXxsYw6wTamcNW+zcE=
+github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4=
+github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I=
+github.com/magiconair/properties v1.8.10 h1:s31yESBquKXCV9a/ScB3ESkOjUYYv+X0rg8SYxI99mE=
+github.com/magiconair/properties v1.8.10/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0=
 github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
 github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
 github.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcnceauSikq3lYCQ=
@@ -266,14 +324,26 @@ github.com/mitchellh/copystructure v1.2.0 h1:vpKXTN4ewci03Vljg/q9QvCGUDttBOGBIa1
 github.com/mitchellh/copystructure v1.2.0/go.mod h1:qLl+cE2AmVv+CoeAwDPye/v+N2HKCj9FbZEVFJRxO9s=
 github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ=
 github.com/mitchellh/reflectwalk v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw=
+github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0=
+github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo=
+github.com/moby/go-archive v0.2.0 h1:zg5QDUM2mi0JIM9fdQZWC7U8+2ZfixfTYoHL7rWUcP8=
+github.com/moby/go-archive v0.2.0/go.mod h1:mNeivT14o8xU+5q1YnNrkQVpK+dnNe/K6fHqnTg4qPU=
+github.com/moby/moby/api v1.54.1 h1:TqVzuJkOLsgLDDwNLmYqACUuTehOHRGKiPhvH8V3Nn4=
+github.com/moby/moby/api v1.54.1/go.mod h1:+RQ6wluLwtYaTd1WnPLykIDPekkuyD/ROWQClE83pzs=
+github.com/moby/moby/client v0.4.0 h1:S+2XegzHQrrvTCvF6s5HFzcrywWQmuVnhOXe2kiWjIw=
+github.com/moby/moby/client v0.4.0/go.mod h1:QWPbvWchQbxBNdaLSpoKpCdf5E+WxFAgNHogCWDoa7g=
+github.com/moby/patternmatcher v0.6.1 h1:qlhtafmr6kgMIJjKJMDmMWq7WLkKIo23hsrpR3x084U=
+github.com/moby/patternmatcher v0.6.1/go.mod h1:hDPoyOpDY7OrrMDLaYoY3hf52gNCR/YOUYxkhApJIxc=
 github.com/moby/spdystream v0.5.1 h1:9sNYeYZUcci9R6/w7KDaFWEWeV4LStVG78Mpyq/Zm/Y=
 github.com/moby/spdystream v0.5.1/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI=
+github.com/moby/sys/sequential v0.6.0 h1:qrx7XFUd/5DxtqcoH1h438hF5TmOvzC/lspjy7zgvCU=
+github.com/moby/sys/sequential v0.6.0/go.mod h1:uyv8EUTrca5PnDsdMGXhZe6CCe8U/UiTWd+lL+7b/Ko=
 github.com/moby/sys/user v0.4.0 h1:jhcMKit7SA80hivmFJcbB1vqmw//wU61Zdui2eQXuMs=
 github.com/moby/sys/user v0.4.0/go.mod h1:bG+tYYYJgaMtRKgEmuueC0hJEAZWwtIbZTB+85uoHjs=
 github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g=
 github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28=
-github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0=
-github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y=
+github.com/moby/term v0.5.2 h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ=
+github.com/moby/term v0.5.2/go.mod h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFLc=
 github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
@@ -331,6 +401,8 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
 github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU=
+github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE=
 github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
 github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
 github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
@@ -354,10 +426,12 @@ github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQD
 github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo=
 github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ=
 github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM=
+github.com/shirou/gopsutil/v4 v4.26.3 h1:2ESdQt90yU3oXF/CdOlRCJxrP+Am1aBYubTMTfxJ1qc=
+github.com/shirou/gopsutil/v4 v4.26.3/go.mod h1:LZ6ewCSkBqUpvSOf+LsTGnRinC6iaNUNMGBtDkJBaLQ=
 github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k=
 github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME=
-github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
-github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
+github.com/sirupsen/logrus v1.9.4 h1:TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w=
+github.com/sirupsen/logrus v1.9.4/go.mod h1:ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g=
 github.com/smartystreets/assertions v1.0.0/go.mod h1:kHHU4qYBaI3q23Pp3VPrmWhuIUrLW/7eUrw0BU5VaoM=
 github.com/smartystreets/go-aws-auth v0.0.0-20180515143844-0c1422d1fdb9/go.mod h1:SnhjPscd9TpLiy1LpzGSKh3bXCfxxXuqd9xmQJy3slM=
 github.com/smartystreets/gunit v1.0.0/go.mod h1:qwPWnhz6pn0NnRBP++URONOVyNkPyr4SauJk4cUOwJs=
@@ -375,8 +449,8 @@ github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8w
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
 github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
-github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
-github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
+github.com/stretchr/objx v0.5.3 h1:jmXUvGomnU1o3W/V5h2VEradbpJDwGrzugQQvL0POH4=
+github.com/stretchr/objx v0.5.3/go.mod h1:rDQraq+vQZU7Fde9LOZLr8Tax6zZvy4kuNKF+QYS+U0=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
 github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
 github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
@@ -385,6 +459,8 @@ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO
 github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
 github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
 github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
+github.com/testcontainers/testcontainers-go v0.42.0 h1:He3IhTzTZOygSXLJPMX7n44XtK+qhjat1nI9cneBbUY=
+github.com/testcontainers/testcontainers-go v0.42.0/go.mod h1:vZjdY1YmUA1qEForxOIOazfsrdyORJAbhi0bp8plN30=
 github.com/tj/assert v0.0.0-20171129193455-018094318fb0/go.mod h1:mZ9/Rh9oLWpLLDRpvE+3b7gP/C2YyLFYxNmcLnPTMe0=
 github.com/tj/assert v0.0.3 h1:Df/BlaZ20mq6kuai7f5z2TvPFiwC3xaWJSDQNiIS3Rk=
 github.com/tj/assert v0.0.3/go.mod h1:Ne6X72Q+TB1AteidzQncjw9PabbMp4PBMZ1k+vd1Pvk=
@@ -392,6 +468,10 @@ github.com/tj/go-buffer v1.1.0/go.mod h1:iyiJpfFcR2B9sXu7KvjbT9fpM4mOelRSDTbntVj
 github.com/tj/go-elastic v0.0.0-20171221160941-36157cbbebc2/go.mod h1:WjeM0Oo1eNAjXGDx2yma7uG2XoyRZTq1uv3M/o7imD0=
 github.com/tj/go-kinesis v0.0.0-20171128231115-08b17f58cb1b/go.mod h1:/yhzCV0xPfx6jb1bBgRFjl5lytqVqZXEaeqWP8lTEao=
 github.com/tj/go-spin v1.1.0/go.mod h1:Mg1mzmePZm4dva8Qz60H2lHwmJ2loum4VIrLgVnKwh4=
+github.com/tklauser/go-sysconf v0.3.16 h1:frioLaCQSsF5Cy1jgRBrzr6t502KIIwQ0MArYICU0nA=
+github.com/tklauser/go-sysconf v0.3.16/go.mod h1:/qNL9xxDhc7tx3HSRsLWNnuzbVfh3e7gh/BmM179nYI=
+github.com/tklauser/numcpus v0.11.0 h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9RXw=
+github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ=
 github.com/u-root/uio v0.0.0-20230220225925-ffce2a382923 h1:tHNk7XK9GkmKUR6Gh8gVBKXc2MVSZ4G/NnWLtzw4gNA=
 github.com/u-root/uio v0.0.0-20230220225925-ffce2a382923/go.mod h1:eLL9Nub3yfAho7qB0MzZizFhTU2QkLeoVsWdHtDW264=
 github.com/urfave/cli v1.22.12 h1:igJgVw1JdKH+trcLWLeLwZjU9fEfPesQ+9/e4MQ44S8=
@@ -408,6 +488,8 @@ github.com/xlab/treeprint v1.2.0 h1:HzHnuAF1plUN2zGlAFHbSQP2qJ0ZAD3XF5XD7OesXRQ=
 github.com/xlab/treeprint v1.2.0/go.mod h1:gj5Gd3gPdKtR1ikdDK6fnFLdmIS0X30kTTuNd/WEJu0=
 github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 h1:ilQV1hzziu+LLM3zUTJ0trRztfwgjqKnBWNtSRkbmwM=
 github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78/go.mod h1:aL8wCCfTfSfmXjznFBSZNN13rSJjlIOI1fUNAtF7rmI=
+github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0=
+github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0=
 go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64=
 go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
 go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus=
@@ -460,9 +542,10 @@ golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5h
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20220622161953-175b2fd9d664/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
@@ -477,8 +560,8 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
 golang.org/x/text v0.36.0 h1:JfKh3XmcRPqZPKevfXVpI1wXPTqbkE5f7JA92a55Yxg=
 golang.org/x/text v0.36.0/go.mod h1:NIdBknypM8iqVmPiuco0Dh6P5Jcdk8lJL0CUebqK164=
-golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY=
-golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
+golang.org/x/time v0.11.0 h1:/bpjEDfN9tkoN/ryeYHnv5hcMlc8ncjMcM4XBk5NWV0=
+golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.44.0 h1:UP4ajHPIcuMjT1GqzDWRlalUEoY+uzoZKnhOjbIPD2c=
 golang.org/x/tools v0.44.0/go.mod h1:KA0AfVErSdxRZIsOVipbv3rQhVXTnlU6UhKxHd1seDI=
@@ -526,6 +609,8 @@ gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C
 gopkg.in/yaml.v3 v3.0.0-20200605160147-a5ece683394c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gotest.tools/v3 v3.5.2 h1:7koQfIKdy+I8UTetycgUqXWSDwpgv193Ka+qRsmBY8Q=
+gotest.tools/v3 v3.5.2/go.mod h1:LtdLGcnqToBH83WByAAi/wiwSFCArdFIUV/xxN4pcjA=
 k8s.io/api v0.35.4 h1:P7nFYKl5vo9AGUp1Z+Pmd3p2tA7bX2wbFWCvDeRv988=
 k8s.io/api v0.35.4/go.mod h1:yl4lqySWOgYJJf9RERXKUwE9g2y+CkuwG+xmcOK8wXU=
 k8s.io/apiextensions-apiserver v0.35.0 h1:3xHk2rTOdWXXJM+RDQZJvdx0yEOgC0FgQ1PlJatA5T4=
@@ -582,6 +667,8 @@ modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
 modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
 oras.land/oras-go/v2 v2.6.0 h1:X4ELRsiGkrbeox69+9tzTu492FMUu7zJQW6eJU+I2oc=
 oras.land/oras-go/v2 v2.6.0/go.mod h1:magiQDfG6H1O9APp+rOsvCPcW1GD2MM7vgnKY0Y+u1o=
+pgregory.net/rapid v1.2.0 h1:keKAYRcjm+e1F0oAuU5F5+YPAWcyxNNRK2wud503Gnk=
+pgregory.net/rapid v1.2.0/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04=
 sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 h1:jpcvIRr3GLoUoEKRkHKSmGjxb6lWwrBlJsXc+eUYQHM=
 sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw=
 sigs.k8s.io/controller-runtime v0.23.3 h1:VjB/vhoPoA9l1kEKZHBMnQF33tdCLQKJtydy4iqwZ80=
diff --git a/hack/cmd/orcaseed/main.go b/hack/cmd/orcaseed/main.go
new file mode 100644
index 00000000..23e83cac
--- /dev/null
+++ b/hack/cmd/orcaseed/main.go
@@ -0,0 +1,10 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package main
+
+import "github.com/Azure/unbounded/hack/cmd/orcaseed/orcaseed"
+
+func main() {
+	orcaseed.Run()
+}
diff --git a/hack/cmd/orcaseed/orcaseed/client.go b/hack/cmd/orcaseed/orcaseed/client.go
new file mode 100644
index 00000000..473233b7
--- /dev/null
+++ b/hack/cmd/orcaseed/orcaseed/client.go
@@ -0,0 +1,102 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package orcaseed
+
+import (
+	"context"
+	"fmt"
+	"strings"
+
+	"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob"
+	"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/bloberror"
+	"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/container"
+)
+
+// azuriteWellKnownDevKey is the documented well-known shared key for
+// Azurite's default account ('devstoreaccount1'). It is a public
+// constant baked into Azurite, not a secret. Documented at
+// https://learn.microsoft.com/azure/storage/common/storage-use-azurite.
+const azuriteWellKnownDevKey = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw=="
+
+// globalFlags carries the connection-shape flags that every subcommand
+// honours. The defaults target the in-cluster Azurite emulator exposed
+// to the host via the dev harness's NodePort 30100.
+type globalFlags struct {
+	endpoint        string
+	account         string
+	accountKey      string
+	containerName   string
+	ensureContainer bool
+}
+
+func defaultGlobalFlags() *globalFlags {
+	return &globalFlags{
+		endpoint:        "http://localhost:30100/devstoreaccount1/",
+		account:         "devstoreaccount1",
+		accountKey:      azuriteWellKnownDevKey,
+		containerName:   "orca-test",
+		ensureContainer: true,
+	}
+}
+
+// newClients constructs the azblob service + container clients from
+// the global flags, applies the ensure-container behaviour if
+// requested, and returns the container client ready for blob
+// operations.
+func (g *globalFlags) newClients(ctx context.Context) (*azblob.Client, *container.Client, error) {
+	if g.endpoint == "" {
+		return nil, nil, fmt.Errorf("--endpoint is required")
+	}
+
+	if g.account == "" {
+		return nil, nil, fmt.Errorf("--account is required")
+	}
+
+	if g.accountKey == "" {
+		return nil, nil, fmt.Errorf("--account-key is required")
+	}
+
+	if g.containerName == "" {
+		return nil, nil, fmt.Errorf("--container is required")
+	}
+
+	cred, err := azblob.NewSharedKeyCredential(g.account, g.accountKey)
+	if err != nil {
+		return nil, nil, fmt.Errorf("shared-key credential: %w", err)
+	}
+	// Trim a trailing slash so containerURL concatenation produces
+	// the expected single-slash boundary.
+	endpoint := strings.TrimRight(g.endpoint, "/")
+
+	svc, err := azblob.NewClientWithSharedKeyCredential(endpoint, cred, nil)
+	if err != nil {
+		return nil, nil, fmt.Errorf("azblob client: %w", err)
+	}
+
+	cc := svc.ServiceClient().NewContainerClient(g.containerName)
+
+	if g.ensureContainer {
+		if err := ensureContainer(ctx, cc); err != nil {
+			return nil, nil, fmt.Errorf("ensure container %q: %w", g.containerName, err)
+		}
+	}
+
+	return svc, cc, nil
+}
+
+// ensureContainer creates the container if it does not exist.
+// ContainerAlreadyExists is treated as success so callers can invoke
+// this idempotently on every run.
+func ensureContainer(ctx context.Context, cc *container.Client) error {
+	_, err := cc.Create(ctx, nil)
+	if err == nil {
+		return nil
+	}
+
+	if bloberror.HasCode(err, bloberror.ContainerAlreadyExists) {
+		return nil
+	}
+
+	return err
+}
diff --git a/hack/cmd/orcaseed/orcaseed/delete.go b/hack/cmd/orcaseed/orcaseed/delete.go
new file mode 100644
index 00000000..47406b57
--- /dev/null
+++ b/hack/cmd/orcaseed/orcaseed/delete.go
@@ -0,0 +1,116 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package orcaseed
+
+import (
+	"bufio"
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"os"
+	"strings"
+
+	"github.com/spf13/cobra"
+
+	"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/container"
+)
+
+type deleteOpts struct {
+	prefix string
+	yes    bool
+}
+
+func newDeleteCmd(g *globalFlags) *cobra.Command {
+	o := &deleteOpts{}
+
+	cmd := &cobra.Command{
+		Use:   "delete",
+		Short: "Delete blobs from the container",
+		Long: `Delete removes every blob in the container whose name begins with
+--prefix (default: all blobs). Without --yes the command lists the
+matching set and prompts for confirmation on stdin.`,
+		RunE: func(cmd *cobra.Command, _ []string) error {
+			return runDelete(cmd.Context(), g, o)
+		},
+	}
+
+	cmd.Flags().StringVar(&o.prefix, "prefix", "",
+		"only delete blobs whose name begins with this prefix (empty = all)")
+	cmd.Flags().BoolVar(&o.yes, "yes", false,
+		"skip the interactive confirmation prompt")
+
+	return cmd
+}
+
+func runDelete(ctx context.Context, g *globalFlags, o *deleteOpts) error {
+	_, cc, err := g.newClients(ctx)
+	if err != nil {
+		return err
+	}
+
+	opts := &container.ListBlobsFlatOptions{}
+	if o.prefix != "" {
+		opts.Prefix = &o.prefix
+	}
+
+	var names []string
+
+	pager := cc.NewListBlobsFlatPager(opts)
+	for pager.More() {
+		page, err := pager.NextPage(ctx)
+		if err != nil {
+			return fmt.Errorf("list: %w", err)
+		}
+
+		for _, item := range page.Segment.BlobItems {
+			if item.Name != nil {
+				names = append(names, *item.Name)
+			}
+		}
+	}
+
+	if len(names) == 0 {
+		fmt.Fprintf(os.Stderr, "no matching blobs in container %q\n", g.containerName)
+		return nil
+	}
+
+	if !o.yes {
+		fmt.Fprintf(os.Stderr, "about to delete %d blob(s) from container %q:\n",
+			len(names), g.containerName)
+
+		for _, n := range names {
+			fmt.Fprintf(os.Stderr, "  %s\n", n)
+		}
+
+		fmt.Fprint(os.Stderr, "proceed? [y/N]: ")
+
+		r := bufio.NewReader(os.Stdin)
+
+		line, err := r.ReadString('\n')
+		if err != nil {
+			if errors.Is(err, io.EOF) {
+				return fmt.Errorf("delete confirmation: stdin closed without input; pass --yes to skip the prompt in non-interactive contexts")
+			}
+
+			return fmt.Errorf("read confirmation: %w", err)
+		}
+
+		if strings.ToLower(strings.TrimSpace(line)) != "y" {
+			fmt.Fprintln(os.Stderr, "aborted.")
+			return nil
+		}
+	}
+
+	for _, n := range names {
+		bc := cc.NewBlobClient(n)
+		if _, err := bc.Delete(ctx, nil); err != nil {
+			return fmt.Errorf("delete %s: %w", n, err)
+		}
+	}
+
+	fmt.Fprintf(os.Stderr, "deleted %d blobs from container %q\n", len(names), g.containerName)
+
+	return nil
+}
diff --git a/hack/cmd/orcaseed/orcaseed/generate.go b/hack/cmd/orcaseed/orcaseed/generate.go
new file mode 100644
index 00000000..ae388832
--- /dev/null
+++ b/hack/cmd/orcaseed/orcaseed/generate.go
@@ -0,0 +1,229 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package orcaseed
+
+import (
+	"context"
+	"crypto/rand"
+	"fmt"
+	"io"
+	mathrand "math/rand"
+	"os"
+	"sync/atomic"
+	"time"
+
+	"github.com/spf13/cobra"
+	"golang.org/x/sync/errgroup"
+
+	"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blockblob"
+)
+
+// generateOpts captures the per-command flags for the generate
+// subcommand. Defaults are conservative (1 MiB x 1 blob) so an
+// accidental invocation with no flags is harmless.
+type generateOpts struct {
+	sizeStr     string
+	count       int
+	prefix      string
+	seed        int64
+	concurrency int
+	force       bool
+}
+
+const (
+	// perBlobMax is the per-blob ceiling. Larger blobs require
+	// --force to acknowledge. Picked at 1 GiB to match the operator's
+	// stated cap and keep accidental "1TiB" typos from filling the
+	// emulator's emptyDir.
+	perBlobMax int64 = 1024 * 1024 * 1024
+	// totalWarn is the cumulative-bytes threshold above which the
+	// command logs a warning before proceeding. Sized to match
+	// perBlobMax for symmetry.
+	totalWarn int64 = 1024 * 1024 * 1024
+)
+
+func newGenerateCmd(g *globalFlags) *cobra.Command {
+	o := &generateOpts{
+		sizeStr:     "1MiB",
+		count:       1,
+		prefix:      "synth-",
+		concurrency: 4,
+	}
+
+	cmd := &cobra.Command{
+		Use:   "generate",
+		Short: "Generate N synthetic blobs of size S and upload them",
+		Long: `Generate creates --count blobs of --size random bytes each, named
+<prefix>0, <prefix>1, ... and uploads them to the configured
+container. Use --seed to make the byte stream reproducible across
+runs (useful when comparing cache behaviour between experiments).`,
+		RunE: func(cmd *cobra.Command, _ []string) error {
+			return runGenerate(cmd.Context(), g, o)
+		},
+	}
+
+	cmd.Flags().StringVar(&o.sizeStr, "size", o.sizeStr,
+		"per-blob size (e.g. 1MiB, 100MB, 1GiB)")
+	cmd.Flags().IntVar(&o.count, "count", o.count,
+		"number of blobs to generate")
+	cmd.Flags().StringVar(&o.prefix, "prefix", o.prefix,
+		"blob name prefix; blobs are named <prefix><index>")
+	cmd.Flags().Int64Var(&o.seed, "seed", o.seed,
+		"PRNG seed for deterministic content; 0 = use crypto/rand")
+	cmd.Flags().IntVar(&o.concurrency, "concurrency", o.concurrency,
+		"number of parallel uploads")
+	cmd.Flags().BoolVar(&o.force, "force", o.force,
+		"allow per-blob size > 1 GiB")
+
+	return cmd
+}
+
+func runGenerate(ctx context.Context, g *globalFlags, o *generateOpts) error {
+	if o.count < 1 {
+		return fmt.Errorf("--count must be >= 1")
+	}
+
+	if o.concurrency < 1 {
+		o.concurrency = 1
+	}
+
+	size, err := parseSize(o.sizeStr)
+	if err != nil {
+		return fmt.Errorf("--size: %w", err)
+	}
+
+	if size < 0 {
+		return fmt.Errorf("--size must be non-negative")
+	}
+
+	if size > perBlobMax && !o.force {
+		return fmt.Errorf("--size %s exceeds per-blob ceiling %s; pass --force to override",
+			formatSize(size), formatSize(perBlobMax))
+	}
+
+	total := size * int64(o.count)
+	if total > totalWarn {
+		fmt.Fprintf(os.Stderr, "warning: cumulative upload is %s (size %s x count %d); proceeding\n",
+			formatSize(total), formatSize(size), o.count)
+	}
+
+	_, cc, err := g.newClients(ctx)
+	if err != nil {
+		return err
+	}
+
+	fmt.Fprintf(os.Stderr, "generating %d blobs of %s (total %s) into container %q at %s\n",
+		o.count, formatSize(size), formatSize(total), g.containerName, g.endpoint)
+
+	var (
+		uploaded atomic.Int64
+		bytes    atomic.Int64
+	)
+
+	progressDone := make(chan struct{})
+
+	go func() {
+		defer close(progressDone)
+
+		t := time.NewTicker(500 * time.Millisecond)
+		defer t.Stop()
+
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case <-t.C:
+				done := uploaded.Load()
+				if done >= int64(o.count) {
+					return
+				}
+
+				fmt.Fprintf(os.Stderr, "  ... uploaded %d / %d (%s)\n",
+					done, o.count, formatSize(bytes.Load()))
+			}
+		}
+	}()
+
+	g2, gctx := errgroup.WithContext(ctx)
+	g2.SetLimit(o.concurrency)
+
+	for i := 0; i < o.count; i++ {
+		i := i
+
+		g2.Go(func() error {
+			name := fmt.Sprintf("%s%d", o.prefix, i)
+
+			body := newRandomReader(size, o.seed, int64(i))
+
+			bc := cc.NewBlockBlobClient(name)
+			if _, err := bc.UploadStream(gctx, body, &blockblob.UploadStreamOptions{}); err != nil {
+				return fmt.Errorf("upload %s: %w", name, err)
+			}
+
+			uploaded.Add(1)
+			bytes.Add(size)
+
+			return nil
+		})
+	}
+
+	if err := g2.Wait(); err != nil {
+		return err
+	}
+
+	<-progressDone
+
+	fmt.Fprintf(os.Stderr, "done: %d blobs, %s total\n", o.count, formatSize(bytes.Load()))
+
+	return nil
+}
+
+// newRandomReader returns an io.Reader producing exactly n bytes.
+// When userSeed == 0 the bytes come from crypto/rand (non-
+// deterministic, intended for typical seed-data workloads). When
+// userSeed != 0 the per-blob byte stream is derived from
+// math/rand.NewSource(userSeed + blobIndex), giving each blob its
+// own independent deterministic stream. The per-blob derivation is
+// what makes determinism survive --concurrency > 1: two invocations
+// of `orcaseed generate --seed 42 --count N --concurrency K`
+// produce byte-identical blobs regardless of upload-completion
+// ordering, because each blob's content is a pure function of
+// (userSeed, blobIndex).
+func newRandomReader(n, userSeed, blobIndex int64) io.Reader {
+	if userSeed == 0 {
+		return io.LimitReader(rand.Reader, n)
+	}
+
+	src := mathrand.NewSource(userSeed + blobIndex)
+
+	return &seededReader{rng: mathrand.New(src), remaining: n} //nolint:gosec // dev tool, deterministic-by-design
+}
+
+// seededReader produces exactly remaining bytes from a per-blob
+// math/rand source. The source is not shared, so no mutex is
+// required and reads do not block other goroutines.
+type seededReader struct {
+	rng       *mathrand.Rand
+	remaining int64
+}
+
+func (r *seededReader) Read(p []byte) (int, error) {
+	if r.remaining <= 0 {
+		return 0, io.EOF
+	}
+
+	want := int64(len(p))
+	if want > r.remaining {
+		want = r.remaining
+	}
+
+	n, _ := r.rng.Read(p[:want]) //nolint:errcheck // math/rand never errors
+
+	r.remaining -= int64(n)
+	if r.remaining == 0 {
+		return n, io.EOF
+	}
+
+	return n, nil
+}
diff --git a/hack/cmd/orcaseed/orcaseed/list.go b/hack/cmd/orcaseed/orcaseed/list.go
new file mode 100644
index 00000000..1e5ba309
--- /dev/null
+++ b/hack/cmd/orcaseed/orcaseed/list.go
@@ -0,0 +1,84 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package orcaseed
+
+import (
+	"context"
+	"fmt"
+	"os"
+
+	"github.com/spf13/cobra"
+
+	"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/container"
+)
+
+type listOpts struct {
+	prefix string
+}
+
+func newListCmd(g *globalFlags) *cobra.Command {
+	o := &listOpts{}
+
+	cmd := &cobra.Command{
+		Use:   "list",
+		Short: "List blobs currently in the container",
+		Long: `List prints "<size>\t<name>" for each blob in the configured
+container, optionally filtered by --prefix.`,
+		RunE: func(cmd *cobra.Command, _ []string) error {
+			return runList(cmd.Context(), g, o)
+		},
+	}
+
+	cmd.Flags().StringVar(&o.prefix, "prefix", "",
+		"only list blobs whose name begins with this prefix")
+
+	return cmd
+}
+
+func runList(ctx context.Context, g *globalFlags, o *listOpts) error {
+	_, cc, err := g.newClients(ctx)
+	if err != nil {
+		return err
+	}
+
+	opts := &container.ListBlobsFlatOptions{}
+	if o.prefix != "" {
+		opts.Prefix = &o.prefix
+	}
+
+	pager := cc.NewListBlobsFlatPager(opts)
+
+	var (
+		count int
+		total int64
+	)
+
+	for pager.More() {
+		page, err := pager.NextPage(ctx)
+		if err != nil {
+			return fmt.Errorf("list: %w", err)
+		}
+
+		for _, item := range page.Segment.BlobItems {
+			name := ""
+			if item.Name != nil {
+				name = *item.Name
+			}
+
+			size := int64(0)
+			if item.Properties != nil && item.Properties.ContentLength != nil {
+				size = *item.Properties.ContentLength
+			}
+
+			fmt.Printf("%-12s\t%s\n", formatSize(size), name)
+
+			count++
+			total += size
+		}
+	}
+
+	fmt.Fprintf(os.Stderr, "(%d blobs, %s total)\n", count, formatSize(total))
+
+	return nil
+}
diff --git a/hack/cmd/orcaseed/orcaseed/orcaseed.go b/hack/cmd/orcaseed/orcaseed/orcaseed.go
new file mode 100644
index 00000000..d43ee29f
--- /dev/null
+++ b/hack/cmd/orcaseed/orcaseed/orcaseed.go
@@ -0,0 +1,62 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Package orcaseed implements the `orcaseed` developer tool used by
+// the Orca dev harness to populate the in-cluster Azurite origin
+// container with synthetic or operator-supplied content. Four
+// subcommands:
+//
+//	generate  - synthesise N blobs of size S each (random bytes;
+//	            optionally seeded for reproducibility).
+//	upload    - upload a single file from disk.
+//	list      - print the blobs currently in the container.
+//	delete    - remove blobs (optional --prefix filter).
+//
+// All subcommands share connection-shape flags (--endpoint,
+// --account, --account-key, --container) defaulting to the dev
+// harness's NodePort-exposed Azurite at localhost:30100. The
+// well-known Azurite dev key is the default --account-key value;
+// it is a public Microsoft-documented constant, not a secret.
+package orcaseed
+
+import (
+	"fmt"
+	"os"
+
+	"github.com/spf13/cobra"
+)
+
+// Run is the entrypoint invoked by cmd/orcaseed/main.go. Wires the
+// cobra command tree, parses flags, dispatches to the chosen
+// subcommand. On error prints to stderr and exits non-zero.
+func Run() {
+	g := defaultGlobalFlags()
+
+	root := &cobra.Command{
+		Use:           "orcaseed",
+		Short:         "Populate the Orca dev-harness origin container",
+		SilenceUsage:  true,
+		SilenceErrors: false,
+	}
+
+	root.PersistentFlags().StringVar(&g.endpoint, "endpoint", g.endpoint,
+		"Azure Blob endpoint URL (path-style, account-included)")
+	root.PersistentFlags().StringVar(&g.account, "account", g.account,
+		"Storage account name")
+	root.PersistentFlags().StringVar(&g.accountKey, "account-key", g.accountKey,
+		"Shared key for the account (default: well-known Azurite dev key)")
+	root.PersistentFlags().StringVar(&g.containerName, "container", g.containerName,
+		"Container to operate against")
+	root.PersistentFlags().BoolVar(&g.ensureContainer, "ensure-container", g.ensureContainer,
+		"Create the container if it does not already exist")
+
+	root.AddCommand(newGenerateCmd(g))
+	root.AddCommand(newUploadCmd(g))
+	root.AddCommand(newListCmd(g))
+	root.AddCommand(newDeleteCmd(g))
+
+	if err := root.Execute(); err != nil {
+		fmt.Fprintf(os.Stderr, "error: %v\n", err)
+		os.Exit(1)
+	}
+}
diff --git a/hack/cmd/orcaseed/orcaseed/orcaseed_test.go b/hack/cmd/orcaseed/orcaseed/orcaseed_test.go
new file mode 100644
index 00000000..4ff33766
--- /dev/null
+++ b/hack/cmd/orcaseed/orcaseed/orcaseed_test.go
@@ -0,0 +1,282 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package orcaseed
+
+import (
+	"context"
+	"encoding/base64"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"net/url"
+	"strings"
+	"sync/atomic"
+	"testing"
+)
+
+// TestParseSize covers every accepted suffix and the error paths.
+func TestParseSize(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		in      string
+		want    int64
+		wantErr bool
+	}{
+		{"1024", 1024, false},
+		{"0", 0, false},
+		{"1B", 1, false},
+		{"1KB", 1000, false},
+		{"1KiB", 1024, false},
+		{"10MB", 10_000_000, false},
+		{"10MiB", 10 * 1024 * 1024, false},
+		{"1GB", 1_000_000_000, false},
+		{"1GiB", 1024 * 1024 * 1024, false},
+		{"1TB", 1_000_000_000_000, false},
+		{"1TiB", 1024 * 1024 * 1024 * 1024, false},
+		{"1.5GB", 1_500_000_000, false},
+		{"  10MiB  ", 10 * 1024 * 1024, false},
+		{"10mib", 10 * 1024 * 1024, false},
+		{"", 0, true},
+		{"abc", 0, true},
+		{"1XB", 0, true},
+		{"-5MB", 0, true},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.in, func(t *testing.T) {
+			got, err := parseSize(tt.in)
+			if tt.wantErr {
+				if err == nil {
+					t.Errorf("parseSize(%q) = %d, want error", tt.in, got)
+				}
+
+				return
+			}
+
+			if err != nil {
+				t.Errorf("parseSize(%q) unexpected err: %v", tt.in, err)
+				return
+			}
+
+			if got != tt.want {
+				t.Errorf("parseSize(%q) = %d, want %d", tt.in, got, tt.want)
+			}
+		})
+	}
+}
+
+// TestFormatSize spot-checks the human-readable rendering at the
+// boundaries between units.
+func TestFormatSize(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		in   int64
+		want string
+	}{
+		{0, "0 B"},
+		{512, "512 B"},
+		{1024, "1.00 KiB"},
+		{2048, "2.00 KiB"},
+		{1024 * 1024, "1.00 MiB"},
+		{10 * 1024 * 1024, "10.00 MiB"},
+		{1024 * 1024 * 1024, "1.00 GiB"},
+	}
+
+	for _, tt := range tests {
+		got := formatSize(tt.in)
+		if got != tt.want {
+			t.Errorf("formatSize(%d) = %q, want %q", tt.in, got, tt.want)
+		}
+	}
+}
+
+// TestGenerate_SeededDeterministic_Concurrent verifies that two
+// generate runs with the same --seed produce byte-identical bodies
+// even under concurrency > 1. The previous implementation used a
+// shared math/rand source serialised through a mutex; bytes flowed
+// to whichever goroutine acquired the lock first, so the same
+// invocation could produce different per-blob bytes between runs
+// based on goroutine-scheduling order. The fixed implementation
+// derives each blob's stream from (seed + blobIndex), so each blob
+// is a pure function of its index and seed regardless of
+// completion ordering.
+//
+// Regression for C-6.
+func TestGenerate_SeededDeterministic_Concurrent(t *testing.T) {
+	t.Parallel()
+
+	bodiesA := startFakeAzurite(t)
+	defer bodiesA.close()
+
+	bodiesB := startFakeAzurite(t)
+	defer bodiesB.close()
+
+	g := defaultGlobalFlags()
+	g.endpoint = bodiesA.url
+	g.account = "devstoreaccount1"
+	g.accountKey = base64.StdEncoding.EncodeToString([]byte("test-shared-key-placeholder--32b"))
+	g.containerName = "ctr"
+
+	o := &generateOpts{
+		sizeStr:     "4KiB",
+		count:       4,
+		prefix:      "synth-",
+		seed:        42,
+		concurrency: 4, // deliberate: prove determinism survives parallel uploads
+	}
+
+	if err := runGenerate(context.Background(), g, o); err != nil {
+		t.Fatalf("first runGenerate: %v", err)
+	}
+
+	g.endpoint = bodiesB.url
+
+	if err := runGenerate(context.Background(), g, o); err != nil {
+		t.Fatalf("second runGenerate: %v", err)
+	}
+
+	for _, name := range []string{"synth-0", "synth-1", "synth-2", "synth-3"} {
+		a := bodiesA.get(name)
+		b := bodiesB.get(name)
+
+		if len(a) == 0 {
+			t.Errorf("blob %q missing from first run", name)
+			continue
+		}
+
+		if len(a) != len(b) {
+			t.Errorf("blob %q length differs across runs: %d vs %d", name, len(a), len(b))
+			continue
+		}
+
+		if string(a) != string(b) {
+			t.Errorf("blob %q bytes differ across two seeded runs (concurrency=%d)",
+				name, o.concurrency)
+		}
+	}
+}
+
+// TestGenerate_SeededDifferentBlobsHaveDifferentContent verifies the
+// per-blob seeding produces distinct streams (so two blobs in the
+// same run are not byte-identical).
+func TestGenerate_SeededDifferentBlobsHaveDifferentContent(t *testing.T) {
+	t.Parallel()
+
+	bodies := startFakeAzurite(t)
+	defer bodies.close()
+
+	g := defaultGlobalFlags()
+	g.endpoint = bodies.url
+	g.account = "devstoreaccount1"
+	g.accountKey = base64.StdEncoding.EncodeToString([]byte("test-shared-key-placeholder--32b"))
+	g.containerName = "ctr"
+
+	o := &generateOpts{
+		sizeStr:     "4KiB",
+		count:       2,
+		prefix:      "synth-",
+		seed:        99,
+		concurrency: 2,
+	}
+
+	if err := runGenerate(context.Background(), g, o); err != nil {
+		t.Fatalf("runGenerate: %v", err)
+	}
+
+	a := bodies.get("synth-0")
+	b := bodies.get("synth-1")
+
+	if len(a) == 0 || len(b) == 0 {
+		t.Fatalf("blobs missing: synth-0=%d synth-1=%d", len(a), len(b))
+	}
+
+	if string(a) == string(b) {
+		t.Errorf("synth-0 and synth-1 have identical content; per-blob seeding broken")
+	}
+}
+
+// fakeAzurite is a minimal httptest-backed server that:
+//   - accepts container Create (PUT ?restype=container) with 201;
+//   - accepts block-blob PUT at /<account>/<container>/<blob> with 201;
+//   - records received bodies indexed by blob name;
+//   - rejects everything else with 400 so test failures are loud.
+type fakeAzurite struct {
+	srv      *httptest.Server
+	url      string
+	mu       atomic.Pointer[map[string][]byte]
+	requests atomic.Int64
+}
+
+func startFakeAzurite(t *testing.T) *fakeAzurite {
+	t.Helper()
+
+	f := &fakeAzurite{}
+	bodies := make(map[string][]byte)
+	f.mu.Store(&bodies)
+
+	mux := http.NewServeMux()
+	mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
+		f.requests.Add(1)
+		// path: /<account>/<container>[/<blob>]
+		// We don't validate the SAS / shared-key signature; the SDK
+		// signs every request and we trust the format.
+		path := strings.TrimPrefix(r.URL.Path, "/")
+
+		parts := strings.SplitN(path, "/", 3)
+		if len(parts) < 2 {
+			http.Error(w, "bad path", http.StatusBadRequest)
+			return
+		}
+		// Container create: PUT /<account>/<container>?restype=container
+		if r.Method == http.MethodPut && len(parts) == 2 && r.URL.Query().Get("restype") == "container" {
+			w.WriteHeader(http.StatusCreated)
+			return
+		}
+
+		if r.Method == http.MethodPut && len(parts) == 3 {
+			body, _ := io.ReadAll(r.Body) //nolint:errcheck // best-effort test reader
+			_ = r.Body.Close()            //nolint:errcheck // best-effort
+
+			cur := *f.mu.Load()
+			next := make(map[string][]byte, len(cur)+1)
+
+			for k, v := range cur {
+				next[k] = v
+			}
+
+			next[parts[2]] = body
+			f.mu.Store(&next)
+
+			w.Header().Set("ETag", "\"fake-etag\"")
+			w.Header().Set("Last-Modified", "Thu, 01 Jan 1970 00:00:00 GMT")
+			w.WriteHeader(http.StatusCreated)
+
+			return
+		}
+
+		http.Error(w, "unexpected request: "+r.Method+" "+r.URL.String(), http.StatusBadRequest)
+	})
+
+	f.srv = httptest.NewServer(mux)
+	// Account-suffixed endpoint shape the SDK expects.
+	f.url = f.srv.URL + "/devstoreaccount1/"
+
+	// Validate the URL parses cleanly.
+	if _, err := url.Parse(f.url); err != nil {
+		t.Fatalf("fake azurite endpoint parse: %v", err)
+	}
+
+	return f
+}
+
+func (f *fakeAzurite) close() {
+	f.srv.Close()
+}
+
+func (f *fakeAzurite) get(name string) []byte {
+	cur := *f.mu.Load()
+	return cur[name]
+}
diff --git a/hack/cmd/orcaseed/orcaseed/size.go b/hack/cmd/orcaseed/orcaseed/size.go
new file mode 100644
index 00000000..4ea835f5
--- /dev/null
+++ b/hack/cmd/orcaseed/orcaseed/size.go
@@ -0,0 +1,109 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package orcaseed
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+)
+
+// parseSize converts a human-readable size string into a byte count.
+// Supports the following suffixes (case-insensitive): B, KB, KiB, MB,
+// MiB, GB, GiB, TB, TiB. Decimal suffixes (KB, MB, ...) use base 1000;
+// binary suffixes (KiB, MiB, ...) use base 1024. Bare numbers are
+// interpreted as bytes.
+//
+// Examples:
+//
+//	"1024"   -> 1024
+//	"1KB"    -> 1000
+//	"1KiB"   -> 1024
+//	"10MiB"  -> 10485760
+//	"1.5GB"  -> 1500000000
+func parseSize(s string) (int64, error) {
+	s = strings.TrimSpace(s)
+	if s == "" {
+		return 0, fmt.Errorf("empty size string")
+	}
+	// Walk forward to find the numeric / suffix split.
+	i := 0
+	for i < len(s) {
+		c := s[i]
+		if (c >= '0' && c <= '9') || c == '.' {
+			i++
+			continue
+		}
+
+		break
+	}
+
+	if i == 0 {
+		return 0, fmt.Errorf("size %q has no numeric prefix", s)
+	}
+
+	numStr := s[:i]
+	suffix := strings.ToLower(strings.TrimSpace(s[i:]))
+
+	num, err := strconv.ParseFloat(numStr, 64)
+	if err != nil {
+		return 0, fmt.Errorf("invalid number %q: %w", numStr, err)
+	}
+
+	if num < 0 {
+		return 0, fmt.Errorf("size must be non-negative, got %s", numStr)
+	}
+
+	var mult int64
+
+	switch suffix {
+	case "", "b":
+		mult = 1
+	case "k", "kb":
+		mult = 1000
+	case "ki", "kib":
+		mult = 1024
+	case "m", "mb":
+		mult = 1000 * 1000
+	case "mi", "mib":
+		mult = 1024 * 1024
+	case "g", "gb":
+		mult = 1000 * 1000 * 1000
+	case "gi", "gib":
+		mult = 1024 * 1024 * 1024
+	case "t", "tb":
+		mult = 1000 * 1000 * 1000 * 1000
+	case "ti", "tib":
+		mult = 1024 * 1024 * 1024 * 1024
+	default:
+		return 0, fmt.Errorf("size %q has unrecognized suffix %q (want B, KB/KiB, MB/MiB, GB/GiB, TB/TiB)", s, suffix)
+	}
+
+	return int64(num * float64(mult)), nil
+}
+
+// formatSize renders a byte count as a human-friendly string using
+// binary suffixes (KiB, MiB, GiB). Used in progress and summary
+// output where readability matters more than precision.
+func formatSize(n int64) string {
+	const (
+		kib int64 = 1024
+		mib int64 = 1024 * kib
+		gib int64 = 1024 * mib
+		tib int64 = 1024 * gib
+	)
+
+	switch {
+	case n >= tib:
+		return fmt.Sprintf("%.2f TiB", float64(n)/float64(tib))
+	case n >= gib:
+		return fmt.Sprintf("%.2f GiB", float64(n)/float64(gib))
+	case n >= mib:
+		return fmt.Sprintf("%.2f MiB", float64(n)/float64(mib))
+	case n >= kib:
+		return fmt.Sprintf("%.2f KiB", float64(n)/float64(kib))
+	default:
+		return fmt.Sprintf("%d B", n)
+	}
+}
diff --git a/hack/cmd/orcaseed/orcaseed/upload.go b/hack/cmd/orcaseed/orcaseed/upload.go
new file mode 100644
index 00000000..f746a26d
--- /dev/null
+++ b/hack/cmd/orcaseed/orcaseed/upload.go
@@ -0,0 +1,85 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package orcaseed
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"path/filepath"
+
+	"github.com/spf13/cobra"
+
+	"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blockblob"
+)
+
+type uploadOpts struct {
+	file string
+	name string
+}
+
+func newUploadCmd(g *globalFlags) *cobra.Command {
+	o := &uploadOpts{}
+
+	cmd := &cobra.Command{
+		Use:   "upload",
+		Short: "Upload a single file from disk into the container",
+		Long: `Upload reads --file from local disk and stores it in the configured
+container under --name (default: filepath.Base(--file)). The
+upload streams in chunks; very large files don't buffer in memory.`,
+		RunE: func(cmd *cobra.Command, _ []string) error {
+			return runUpload(cmd.Context(), g, o)
+		},
+	}
+
+	cmd.Flags().StringVar(&o.file, "file", "", "local file to upload (required)")
+	cmd.Flags().StringVar(&o.name, "name", "",
+		"destination blob name (default: basename of --file)")
+
+	return cmd
+}
+
+func runUpload(ctx context.Context, g *globalFlags, o *uploadOpts) error {
+	if o.file == "" {
+		return fmt.Errorf("--file is required")
+	}
+
+	st, err := os.Stat(o.file)
+	if err != nil {
+		return fmt.Errorf("stat --file: %w", err)
+	}
+
+	if st.IsDir() {
+		return fmt.Errorf("--file %q is a directory; only single files are supported", o.file)
+	}
+
+	name := o.name
+	if name == "" {
+		name = filepath.Base(o.file)
+	}
+
+	_, cc, err := g.newClients(ctx)
+	if err != nil {
+		return err
+	}
+
+	f, err := os.Open(o.file)
+	if err != nil {
+		return fmt.Errorf("open --file: %w", err)
+	}
+
+	defer f.Close() //nolint:errcheck // upload tool, file close best-effort on success path
+
+	fmt.Fprintf(os.Stderr, "uploading %s (%s) -> %s/%s\n",
+		o.file, formatSize(st.Size()), g.containerName, name)
+
+	bc := cc.NewBlockBlobClient(name)
+	if _, err := bc.UploadStream(ctx, f, &blockblob.UploadStreamOptions{}); err != nil {
+		return fmt.Errorf("upload: %w", err)
+	}
+
+	fmt.Fprintf(os.Stderr, "done.\n")
+
+	return nil
+}
diff --git a/hack/cmd/render-manifests/main.go b/hack/cmd/render-manifests/main.go
index 475c7129..187676fa 100644
--- a/hack/cmd/render-manifests/main.go
+++ b/hack/cmd/render-manifests/main.go
@@ -10,19 +10,19 @@
 // evaluate to empty strings (text/template's missingkey=zero behaviour for map
 // data), which lets templates rely on sprig's `default` function to supply
 // documented fallbacks.
+//
+// The actual rendering logic lives in the render sub-package so it can be
+// invoked programmatically from tests.
 package main
 
 import (
-	"bytes"
 	"flag"
 	"fmt"
 	"os"
-	"path/filepath"
 	"sort"
 	"strings"
-	"text/template"
 
-	"github.com/Masterminds/sprig/v3"
+	"github.com/Azure/unbounded/hack/cmd/render-manifests/render"
 )
 
 // setFlags implements flag.Value for repeatable --set key=value arguments.
@@ -75,60 +75,11 @@ func main() {
 		exitWithError("--output-dir is required")
 	}
 
-	if err := renderTemplates(templatesDir, outputDir, data); err != nil {
+	if err := render.Render(templatesDir, outputDir, data); err != nil {
 		exitWithError(err.Error())
 	}
 }
 
-func renderTemplates(templatesDir, outputDir string, data setFlags) error {
-	return filepath.WalkDir(templatesDir, func(path string, d os.DirEntry, err error) error {
-		if err != nil {
-			return err
-		}
-
-		if d.IsDir() {
-			return nil
-		}
-
-		if !strings.HasSuffix(path, ".yaml.tmpl") {
-			return nil
-		}
-
-		relPath, err := filepath.Rel(templatesDir, path)
-		if err != nil {
-			return err
-		}
-
-		outputRelPath := strings.TrimSuffix(relPath, ".tmpl")
-		outputPath := filepath.Join(outputDir, outputRelPath)
-
-		templateBytes, err := os.ReadFile(path)
-		if err != nil {
-			return fmt.Errorf("read template %q: %w", path, err)
-		}
-
-		tmpl, err := template.New(relPath).Funcs(sprig.TxtFuncMap()).Option("missingkey=zero").Parse(string(templateBytes))
-		if err != nil {
-			return fmt.Errorf("parse template %q: %w", path, err)
-		}
-
-		if err := os.MkdirAll(filepath.Dir(outputPath), 0o755); err != nil {
-			return fmt.Errorf("create output dir for %q: %w", outputPath, err)
-		}
-
-		var rendered bytes.Buffer
-		if err := tmpl.Execute(&rendered, map[string]string(data)); err != nil {
-			return fmt.Errorf("execute template %q: %w", path, err)
-		}
-
-		if err := os.WriteFile(outputPath, rendered.Bytes(), 0o644); err != nil {
-			return fmt.Errorf("write rendered manifest %q: %w", outputPath, err)
-		}
-
-		return nil
-	})
-}
-
 func exitWithError(message string) {
 	fmt.Fprintln(os.Stderr, message)
 	os.Exit(1)
diff --git a/hack/cmd/render-manifests/render/render.go b/hack/cmd/render-manifests/render/render.go
new file mode 100644
index 00000000..13d3dce5
--- /dev/null
+++ b/hack/cmd/render-manifests/render/render.go
@@ -0,0 +1,75 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Package render implements the manifest template renderer used by
+// the render-manifests CLI. Exposed as a package so tests in other
+// packages (e.g. internal/orca/manifests) can render the orca
+// templates programmatically without shelling out to `go run`.
+package render
+
+import (
+	"bytes"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"text/template"
+
+	"github.com/Masterminds/sprig/v3"
+)
+
+// Render walks templatesDir for *.yaml.tmpl files, executes each with
+// Go's text/template (plus the sprig function library), and writes
+// the rendered output under outputDir mirroring the source tree.
+//
+// Template data is supplied via the data map. Missing keys evaluate
+// to empty strings (text/template's missingkey=zero), which lets
+// templates rely on sprig's `default` function for fallbacks.
+func Render(templatesDir, outputDir string, data map[string]string) error {
+	return filepath.WalkDir(templatesDir, func(path string, d os.DirEntry, err error) error {
+		if err != nil {
+			return err
+		}
+
+		if d.IsDir() {
+			return nil
+		}
+
+		if !strings.HasSuffix(path, ".yaml.tmpl") {
+			return nil
+		}
+
+		relPath, err := filepath.Rel(templatesDir, path)
+		if err != nil {
+			return err
+		}
+
+		outputRelPath := strings.TrimSuffix(relPath, ".tmpl")
+		outputPath := filepath.Join(outputDir, outputRelPath)
+
+		templateBytes, err := os.ReadFile(path)
+		if err != nil {
+			return fmt.Errorf("read template %q: %w", path, err)
+		}
+
+		tmpl, err := template.New(relPath).Funcs(sprig.TxtFuncMap()).Option("missingkey=zero").Parse(string(templateBytes))
+		if err != nil {
+			return fmt.Errorf("parse template %q: %w", path, err)
+		}
+
+		if err := os.MkdirAll(filepath.Dir(outputPath), 0o755); err != nil {
+			return fmt.Errorf("create output dir for %q: %w", outputPath, err)
+		}
+
+		var rendered bytes.Buffer
+		if err := tmpl.Execute(&rendered, data); err != nil {
+			return fmt.Errorf("execute template %q: %w", path, err)
+		}
+
+		if err := os.WriteFile(outputPath, rendered.Bytes(), 0o644); err != nil {
+			return fmt.Errorf("write rendered manifest %q: %w", outputPath, err)
+		}
+
+		return nil
+	})
+}
diff --git a/hack/orca/.gitignore b/hack/orca/.gitignore
new file mode 100644
index 00000000..e19a8c5e
--- /dev/null
+++ b/hack/orca/.gitignore
@@ -0,0 +1,3 @@
+# Dev-only artifacts; never committed.
+rendered-dev/
+.env
diff --git a/hack/orca/Makefile b/hack/orca/Makefile
new file mode 100644
index 00000000..92f0f171
--- /dev/null
+++ b/hack/orca/Makefile
@@ -0,0 +1,310 @@
+# hack/orca/Makefile - dev-harness targets for the Orca origin cache.
+#
+# Invoke from the repo root: `make -C hack/orca <target>`. The root
+# Makefile also defines `orca-up`, `orca-down`, `orca-reset` which
+# proxy here.
+#
+# These targets stand up a local Kind cluster, build the Orca container
+# image with podman, side-load it into Kind, deploy LocalStack as the
+# cachestore backend, and apply the rendered Orca manifests. The
+# harness validates the Kubernetes deployment shape (manifests, image,
+# headless-Service DNS, RBAC, init-Job ordering); for Go-level
+# behavior coverage use `make orca-inttest` which runs the in-process
+# integration suite under internal/orca/inttest/.
+
+REPO_ROOT  := $(abspath $(dir $(lastword $(MAKEFILE_LIST)))/../..)
+HACK_DIR   := $(dir $(lastword $(MAKEFILE_LIST)))
+
+# Cluster + namespace knobs.
+CLUSTER_NAME      ?= orca-dev
+NAMESPACE         ?= unbounded-kube
+KIND_CONFIG       ?= $(HACK_DIR)kind-config.yaml
+
+# Image tag pinned to :dev so kind load and rollout-restart use a
+# stable identifier (the auto-derived VERSION can include slashes from
+# git tags like images/agent-ubuntu2404-nvidia/v..., which are illegal
+# in OCI tags).
+ORCA_VERSION      ?= dev
+ORCA_IMAGE        ?= ghcr.io/azure/orca:$(ORCA_VERSION)
+
+# Container engine (podman in CI, podman or docker locally). kind load
+# image-archive accepts an OCI tarball produced by either.
+CONTAINER_ENGINE  ?= podman
+
+# Path to user .env (sourced by helper scripts that need it).
+ENV_FILE          ?= $(HACK_DIR).env
+
+# Rendered manifest dirs (per-Makefile target overrides for the dev
+# rendering of pluggable orca manifests + the dev-only LocalStack/init
+# manifests).
+ORCA_RENDERED     := $(REPO_ROOT)/deploy/orca/rendered
+DEV_TEMPLATES     := $(REPO_ROOT)/deploy/orca/dev
+DEV_RENDERED      := $(HACK_DIR)rendered-dev
+
+.PHONY: help up down reset render render-dev image kind-create kind-load \
+        deploy deploy-localstack deploy-azurite deploy-azurite-maybe \
+        deploy-credentials deploy-orca \
+        wait-ready logs port-forward seed-azure status \
+        seed-generate seed-upload seed-list seed-delete
+
+help: ## Show this help
+	@echo ""
+	@echo "Usage: make -C hack/orca <target> [VAR=value ...]"
+	@echo ""
+	@echo "Lifecycle:"
+	@echo "  up                  Bring up Kind cluster + LocalStack + Orca"
+	@echo "  down                Delete Kind cluster"
+	@echo "  reset               Rebuild image + rollout-restart deployment"
+	@echo ""
+	@echo "Pieces (typically called by 'up'):"
+	@echo "  render              Render orca manifests"
+	@echo "  render-dev          Render dev-only manifests (LocalStack, init job)"
+	@echo "  image               Build Orca container image (image-orca-local)"
+	@echo "  kind-create         Create the Kind cluster (idempotent)"
+	@echo "  kind-load           Load the Orca image into Kind nodes"
+	@echo "  deploy-localstack   Apply LocalStack Deployment + bucket init Job"
+	@echo "  deploy-credentials  Create the orca-credentials Secret from .env"
+	@echo "  deploy-orca         Apply rendered Orca manifests"
+	@echo "  wait-ready          Block until 3/3 orca pods are Ready"
+	@echo ""
+	@echo "Operate:"
+	@echo "  status              kubectl get pods -n $(NAMESPACE)"
+	@echo "  logs                Tail logs from all Orca pods"
+	@echo "  port-forward        Forward localhost:8443 -> svc/orca"
+	@echo "  seed-azure          Upload a file to real Azure (FILE=path; requires .env creds)"
+	@echo ""
+	@echo "Seed origin (Azurite via NodePort 30100; needs cluster up + ORIGIN_DRIVER=azureblob):"
+	@echo "  seed-generate SEED_ARGS='--size 10MiB --count 5'  Synthesise N blobs of size S"
+	@echo "  seed-upload   FILE=/path/to/file                   Upload a single file"
+	@echo "  seed-list     [SEED_ARGS='--prefix foo']           List blobs in the container"
+	@echo "  seed-delete   [PREFIX=foo] [SEED_ARGS='--yes']     Delete blobs (interactive by default)"
+	@echo ""
+	@echo "Note: For Go-level behavior testing (chunked GETs, cluster routing,"
+	@echo "singleflight, peer fallback) use 'make orca-inttest' from the repo"
+	@echo "root. That suite exercises the same code paths against testcontainers"
+	@echo "without needing Kind."
+	@echo ""
+	@echo "Variables:"
+	@echo "  CLUSTER_NAME=$(CLUSTER_NAME)"
+	@echo "  NAMESPACE=$(NAMESPACE)"
+	@echo "  ORCA_IMAGE=$(ORCA_IMAGE)"
+	@echo "  CONTAINER_ENGINE=$(CONTAINER_ENGINE)"
+	@echo "  ENV_FILE=$(ENV_FILE)"
+
+# -- Top-level lifecycle ------------------------------------------------------
+
+up: kind-create image kind-load deploy ## End-to-end bring-up
+
+down: ## Delete Kind cluster
+	CLUSTER_NAME="$(CLUSTER_NAME)" $(HACK_DIR)down.sh
+
+reset: image kind-load ## Rebuild image and rolling-restart Orca
+	kubectl --context kind-$(CLUSTER_NAME) -n $(NAMESPACE) rollout restart deployment/orca
+	kubectl --context kind-$(CLUSTER_NAME) -n $(NAMESPACE) rollout status deployment/orca --timeout=120s
+
+# `deploy` deploys whichever origin backend matches ORIGIN_DRIVER in
+# .env (default: awss3 -> LocalStack only; azureblob also brings up
+# Azurite). The cachestore is always LocalStack regardless. Init Jobs
+# are idempotent so re-applying is safe.
+deploy: render render-dev deploy-localstack deploy-azurite-maybe deploy-credentials deploy-orca wait-ready ## Apply all manifests + Secret
+
+deploy-azurite-maybe: render-dev
+	@if [ -f "$(ENV_FILE)" ]; then set -a && . "$(ENV_FILE)" && set +a; fi; \
+	driver="$${ORIGIN_DRIVER:-awss3}"; \
+	if [ "$$driver" = "azureblob" ]; then \
+		echo "ORIGIN_DRIVER=azureblob -> deploying Azurite"; \
+		$(MAKE) deploy-azurite; \
+	else \
+		echo "ORIGIN_DRIVER=$$driver -> Azurite not required (skipping)"; \
+	fi
+
+# -- Rendering ----------------------------------------------------------------
+
+# Render the pluggable orca manifests with the dev image. Default
+# origin driver in the dev harness is awss3 pointing at the same
+# in-cluster LocalStack instance (different bucket); reviewers can
+# override by setting ORIGIN_DRIVER=azureblob and the appropriate
+# AZURE_* values in .env. Credentials are NOT rendered into the
+# ConfigMap; they ride in via the orca-credentials Secret as env vars
+# (envFrom).
+render:
+	@echo "Rendering orca manifests with image=$(ORCA_IMAGE)"
+	@mkdir -p "$(ORCA_RENDERED)"
+	@find "$(ORCA_RENDERED)" -mindepth 1 -not -name .gitignore -delete 2>/dev/null || true
+	@if [ -f "$(ENV_FILE)" ]; then \
+		set -a && . "$(ENV_FILE)" && set +a; \
+	fi; \
+	driver="$${ORIGIN_DRIVER:-awss3}"; \
+	if [ "$$driver" = "azureblob" ]; then \
+		azure_account="$${AZURE_STORAGE_ACCOUNT:-devstoreaccount1}"; \
+		azure_container="$${AZURE_CONTAINER:-$${AZURITE_CONTAINER:-orca-test}}"; \
+		azure_endpoint="$${AZUREBLOB_ENDPOINT:-http://azurite.$(NAMESPACE).svc.cluster.local:10000/devstoreaccount1/}"; \
+	else \
+		azure_account="$${AZURE_STORAGE_ACCOUNT:-}"; \
+		azure_container="$${AZURE_CONTAINER:-orca-test}"; \
+		azure_endpoint="$${AZUREBLOB_ENDPOINT:-}"; \
+	fi; \
+	go run "$(REPO_ROOT)/hack/cmd/render-manifests" \
+		--templates-dir "$(REPO_ROOT)/deploy/orca" \
+		--output-dir "$(ORCA_RENDERED)" \
+		--set Namespace="$(NAMESPACE)" \
+		--set Image="$(ORCA_IMAGE)" \
+		--set ImagePullPolicy=IfNotPresent \
+		--set TargetReplicas="$${TARGET_REPLICAS:-3}" \
+		--set OriginID="$${ORIGIN_ID:-awss3-localstack}" \
+		--set OriginDriver="$$driver" \
+		--set AzureAccount="$$azure_account" \
+		--set AzureContainer="$$azure_container" \
+		--set AzureEndpoint="$$azure_endpoint" \
+		--set OriginAWSS3Endpoint="$${ORIGIN_AWSS3_ENDPOINT:-http://localstack.$(NAMESPACE).svc.cluster.local:4566}" \
+		--set OriginAWSS3Region="$${ORIGIN_AWSS3_REGION:-us-east-1}" \
+		--set OriginAWSS3Bucket="$${ORIGIN_AWSS3_BUCKET:-orca-origin}" \
+		--set OriginAWSS3UsePathStyle="true" \
+		--set CachestoreBucket="$${CACHESTORE_BUCKET:-orca-cache}" \
+		--set CachestoreEndpoint="$${CACHESTORE_ENDPOINT:-http://localstack.$(NAMESPACE).svc.cluster.local:4566}" \
+		--set CachestoreRegion="$${CACHESTORE_REGION:-us-east-1}" \
+		--set ClusterService="orca-peers.$(NAMESPACE).svc.cluster.local" \
+		--set ServerAuthEnabled=false \
+		--set InternalTLSEnabled=false \
+		--set LogLevel="$${LOG_LEVEL:-info}"
+
+render-dev:
+	@echo "Rendering dev manifests (LocalStack, init job, Azurite)"
+	@mkdir -p "$(DEV_RENDERED)"
+	@find "$(DEV_RENDERED)" -mindepth 1 -delete 2>/dev/null || true
+	@if [ -f "$(ENV_FILE)" ]; then \
+		set -a && . "$(ENV_FILE)" && set +a; \
+	fi; \
+	go run "$(REPO_ROOT)/hack/cmd/render-manifests" \
+		--templates-dir "$(DEV_TEMPLATES)" \
+		--output-dir "$(DEV_RENDERED)" \
+		--set Namespace="$(NAMESPACE)" \
+		--set CachestoreBucket="$${CACHESTORE_BUCKET:-orca-cache}" \
+		--set OriginBucket="$${ORIGIN_AWSS3_BUCKET:-orca-origin}" \
+		--set AzuriteContainer="$${AZURE_CONTAINER:-$${AZURITE_CONTAINER:-orca-test}}" \
+		--set AzuriteNodePort="$${AZURITE_NODE_PORT:-30100}"
+
+# -- Image + cluster ----------------------------------------------------------
+
+image:
+	@echo "Building Orca image $(ORCA_IMAGE) with $(CONTAINER_ENGINE)"
+	cd "$(REPO_ROOT)" && $(MAKE) image-orca-local \
+		VERSION=$(ORCA_VERSION) \
+		CONTAINER_ENGINE=$(CONTAINER_ENGINE) \
+		ORCA_IMAGE=$(ORCA_IMAGE)
+
+kind-create:
+	CLUSTER_NAME="$(CLUSTER_NAME)" KIND_CONFIG="$(KIND_CONFIG)" $(HACK_DIR)kind-create.sh
+
+kind-load:
+	CLUSTER_NAME="$(CLUSTER_NAME)" \
+	ORCA_IMAGE="$(ORCA_IMAGE)" \
+	CONTAINER_ENGINE="$(CONTAINER_ENGINE)" \
+		$(HACK_DIR)kind-load.sh
+
+# -- Deploy steps -------------------------------------------------------------
+
+deploy-localstack: render-dev render
+	kubectl --context kind-$(CLUSTER_NAME) apply -f "$(ORCA_RENDERED)/01-namespace.yaml"
+	kubectl --context kind-$(CLUSTER_NAME) apply -f "$(DEV_RENDERED)/01-localstack.yaml"
+	kubectl --context kind-$(CLUSTER_NAME) -n $(NAMESPACE) rollout status deployment/localstack --timeout=120s
+	kubectl --context kind-$(CLUSTER_NAME) apply -f "$(DEV_RENDERED)/02-init-job.yaml"
+	kubectl --context kind-$(CLUSTER_NAME) -n $(NAMESPACE) wait --for=condition=complete job/orca-buckets-init --timeout=120s
+
+deploy-azurite: render-dev
+	kubectl --context kind-$(CLUSTER_NAME) apply -f "$(ORCA_RENDERED)/01-namespace.yaml"
+	kubectl --context kind-$(CLUSTER_NAME) apply -f "$(DEV_RENDERED)/03-azurite.yaml"
+	kubectl --context kind-$(CLUSTER_NAME) -n $(NAMESPACE) rollout status deployment/azurite --timeout=180s
+	kubectl --context kind-$(CLUSTER_NAME) apply -f "$(DEV_RENDERED)/04-azurite-init.yaml"
+	kubectl --context kind-$(CLUSTER_NAME) -n $(NAMESPACE) wait --for=condition=complete job/orca-azurite-container-init --timeout=180s
+
+deploy-credentials:
+	CLUSTER_NAME="$(CLUSTER_NAME)" \
+	NAMESPACE="$(NAMESPACE)" \
+	ENV_FILE="$(ENV_FILE)" \
+		$(HACK_DIR)deploy-credentials.sh
+
+deploy-orca: render
+	kubectl --context kind-$(CLUSTER_NAME) apply -f "$(ORCA_RENDERED)/02-rbac.yaml"
+	kubectl --context kind-$(CLUSTER_NAME) apply -f "$(ORCA_RENDERED)/03-config.yaml"
+	# Service before Deployment: the headless orca-peers Service must
+	# exist (with its DNS A-records) before the pods start so the
+	# initial cluster.refresh sees the full peer set instead of
+	# bootstrapping into the self-only fallback.
+	kubectl --context kind-$(CLUSTER_NAME) apply -f "$(ORCA_RENDERED)/05-service.yaml"
+	kubectl --context kind-$(CLUSTER_NAME) apply -f "$(ORCA_RENDERED)/04-deployment.yaml"
+
+wait-ready:
+	kubectl --context kind-$(CLUSTER_NAME) -n $(NAMESPACE) rollout status deployment/orca --timeout=180s
+
+# -- Operate ------------------------------------------------------------------
+
+status:
+	kubectl --context kind-$(CLUSTER_NAME) -n $(NAMESPACE) get pods -o wide
+
+logs:
+	kubectl --context kind-$(CLUSTER_NAME) -n $(NAMESPACE) logs -l app.kubernetes.io/name=orca --tail=200 -f
+
+port-forward:
+	@echo "Forwarding localhost:8443 -> svc/orca:8443 ..."
+	kubectl --context kind-$(CLUSTER_NAME) -n $(NAMESPACE) port-forward svc/orca 8443:8443
+
+seed-azure: ## Upload a file to real Azure (requires AZURE_STORAGE_* in .env; pass FILE=...)
+	@[ -n "$(FILE)" ] || { echo "Usage: make seed-azure FILE=/path/to/file [SEED_ARGS='--name foo']" >&2; exit 1; }
+	@if [ -f "$(ENV_FILE)" ]; then set -a && . "$(ENV_FILE)" && set +a; fi; \
+	[ -n "$${AZURE_STORAGE_ACCOUNT:-}" ] || { echo "AZURE_STORAGE_ACCOUNT not set in $(ENV_FILE)" >&2; exit 1; }; \
+	[ -n "$${AZURE_STORAGE_KEY:-}" ]     || { echo "AZURE_STORAGE_KEY not set in $(ENV_FILE)"     >&2; exit 1; }; \
+	[ -n "$${AZURE_CONTAINER:-}" ]       || { echo "AZURE_CONTAINER not set in $(ENV_FILE)"       >&2; exit 1; }; \
+	go run "$(REPO_ROOT)/hack/cmd/orcaseed" upload \
+		--endpoint "https://$${AZURE_STORAGE_ACCOUNT}.blob.core.windows.net/" \
+		--account "$${AZURE_STORAGE_ACCOUNT}" \
+		--account-key "$${AZURE_STORAGE_KEY}" \
+		--container "$${AZURE_CONTAINER}" \
+		--file "$(FILE)" \
+		$(SEED_ARGS)
+
+# -- Seeder (orcaseed) helpers ------------------------------------------------
+#
+# These targets invoke hack/cmd/orcaseed against the in-cluster Azurite
+# emulator exposed on the host loopback via the NodePort 30100 baked
+# into deploy/orca/dev/03-azurite.yaml.tmpl. Override AZURITE_NODE_PORT
+# in .env if you've bumped the NodePort to avoid a host-port conflict.
+# Pass extra flags via SEED_ARGS, e.g.:
+#
+#   make -C hack/orca seed-generate SEED_ARGS='--size 10MiB --count 5'
+#   make -C hack/orca seed-upload FILE=~/data.tar.gz
+#   make -C hack/orca seed-list
+#   make -C hack/orca seed-delete PREFIX=synth- SEED_ARGS='--yes'
+
+SEED_ENDPOINT ?= http://localhost:$${AZURITE_NODE_PORT:-30100}/devstoreaccount1/
+
+seed-generate: ## Generate synthetic blobs and upload to the Azurite origin
+	@if [ -f "$(ENV_FILE)" ]; then set -a && . "$(ENV_FILE)" && set +a; fi; \
+	go run "$(REPO_ROOT)/hack/cmd/orcaseed" generate \
+		--endpoint "$(SEED_ENDPOINT)" \
+		--container "$${AZURE_CONTAINER:-orca-test}" \
+		$(SEED_ARGS)
+
+seed-upload: ## Upload a file to the Azurite origin (use FILE=/path/to/file)
+	@[ -n "$(FILE)" ] || { echo "Usage: make seed-upload FILE=/path/to/file [SEED_ARGS='--name foo']" >&2; exit 1; }
+	@if [ -f "$(ENV_FILE)" ]; then set -a && . "$(ENV_FILE)" && set +a; fi; \
+	go run "$(REPO_ROOT)/hack/cmd/orcaseed" upload \
+		--endpoint "$(SEED_ENDPOINT)" \
+		--container "$${AZURE_CONTAINER:-orca-test}" \
+		--file "$(FILE)" \
+		$(SEED_ARGS)
+
+seed-list: ## List blobs in the Azurite origin container
+	@if [ -f "$(ENV_FILE)" ]; then set -a && . "$(ENV_FILE)" && set +a; fi; \
+	go run "$(REPO_ROOT)/hack/cmd/orcaseed" list \
+		--endpoint "$(SEED_ENDPOINT)" \
+		--container "$${AZURE_CONTAINER:-orca-test}" \
+		$(SEED_ARGS)
+
+seed-delete: ## Delete blobs from the Azurite origin container (use PREFIX=foo)
+	@if [ -f "$(ENV_FILE)" ]; then set -a && . "$(ENV_FILE)" && set +a; fi; \
+	go run "$(REPO_ROOT)/hack/cmd/orcaseed" delete \
+		--endpoint "$(SEED_ENDPOINT)" \
+		--container "$${AZURE_CONTAINER:-orca-test}" \
+		--prefix "$(PREFIX)" \
+		$(SEED_ARGS)
diff --git a/hack/orca/deploy-credentials.sh b/hack/orca/deploy-credentials.sh
new file mode 100755
index 00000000..0d8d8045
--- /dev/null
+++ b/hack/orca/deploy-credentials.sh
@@ -0,0 +1,94 @@
+#!/usr/bin/env bash
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+#
+# deploy-credentials.sh - create the orca-credentials Secret holding
+# Azure Blob and S3 cachestore credentials. Sourced from .env so secret
+# values never land in YAML.
+#
+# The dev harness defaults to ORIGIN_DRIVER=awss3 (LocalStack as both
+# origin and cachestore), in which case AZURE_STORAGE_KEY is optional
+# and the Azure key is omitted from the Secret. If you switch to
+# ORIGIN_DRIVER=azureblob, AZURE_STORAGE_KEY becomes required.
+set -euo pipefail
+
+CLUSTER_NAME=${CLUSTER_NAME:?CLUSTER_NAME must be set}
+NAMESPACE=${NAMESPACE:?NAMESPACE must be set}
+ENV_FILE=${ENV_FILE:?ENV_FILE must be set}
+
+if [[ -f "${ENV_FILE}" ]]; then
+  set -a
+  # shellcheck disable=SC1090
+  . "${ENV_FILE}"
+  set +a
+else
+  echo "Note: ${ENV_FILE} not found; proceeding with default awss3 origin (LocalStack)."
+fi
+
+ORIGIN_DRIVER=${ORIGIN_DRIVER:-awss3}
+
+# LocalStack accepts any non-empty creds; pin to test/test for parity
+# with manual aws-cli calls in the init Job. Both the cachestore and
+# (when the awss3 origin driver targets in-cluster LocalStack) the
+# origin use the same creds.
+ORCA_CACHESTORE_S3_ACCESS_KEY=${ORCA_CACHESTORE_S3_ACCESS_KEY:-test}
+ORCA_CACHESTORE_S3_SECRET_KEY=${ORCA_CACHESTORE_S3_SECRET_KEY:-test}
+ORCA_AWSS3_ACCESS_KEY=${ORCA_AWSS3_ACCESS_KEY:-test}
+ORCA_AWSS3_SECRET_KEY=${ORCA_AWSS3_SECRET_KEY:-test}
+
+# Build the kubectl literal flags conditionally so we don't ship empty
+# strings as Azure keys in awss3 mode.
+literals=(
+  "--from-literal=ORCA_CACHESTORE_S3_ACCESS_KEY=${ORCA_CACHESTORE_S3_ACCESS_KEY}"
+  "--from-literal=ORCA_CACHESTORE_S3_SECRET_KEY=${ORCA_CACHESTORE_S3_SECRET_KEY}"
+  "--from-literal=ORCA_AWSS3_ACCESS_KEY=${ORCA_AWSS3_ACCESS_KEY}"
+  "--from-literal=ORCA_AWSS3_SECRET_KEY=${ORCA_AWSS3_SECRET_KEY}"
+)
+
+case "${ORIGIN_DRIVER}" in
+  azureblob)
+    # In azureblob+Azurite mode (no real Azure account), fall back to
+    # the well-known Azurite dev key. This is a public, documented
+    # constant baked into Azurite -- not a secret.
+    #
+    # Gate the fallback on AZURE_STORAGE_ACCOUNT being empty or the
+    # well-known Azurite account name. If the operator set a real
+    # account but forgot the key, hard-fail rather than silently
+    # injecting the Azurite dev key into the Secret (which would
+    # auth-fail at runtime against the real account and obscure the
+    # real problem).
+    if [[ -z "${AZURE_STORAGE_KEY:-}" ]]; then
+      case "${AZURE_STORAGE_ACCOUNT:-}" in
+        ""|"devstoreaccount1")
+          AZURITE_DEV_KEY="Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw=="
+          echo "AZURE_STORAGE_KEY not set; using Azurite well-known dev key (account: devstoreaccount1)."
+          AZURE_STORAGE_KEY="${AZURITE_DEV_KEY}"
+          ;;
+        *)
+          echo "ERROR: AZURE_STORAGE_KEY is required when AZURE_STORAGE_ACCOUNT=${AZURE_STORAGE_ACCOUNT}." >&2
+          echo "The Azurite well-known dev key fallback only applies to account 'devstoreaccount1'." >&2
+          exit 1
+          ;;
+      esac
+    fi
+    literals+=("--from-literal=ORCA_AZUREBLOB_ACCOUNT_KEY=${AZURE_STORAGE_KEY}")
+    ;;
+  awss3)
+    if [[ -n "${AZURE_STORAGE_KEY:-}" ]]; then
+      # Allow it to be present so reviewers can switch drivers without
+      # editing secrets each time.
+      literals+=("--from-literal=ORCA_AZUREBLOB_ACCOUNT_KEY=${AZURE_STORAGE_KEY}")
+    fi
+    ;;
+  *)
+    echo "ERROR: unknown ORIGIN_DRIVER=${ORIGIN_DRIVER}" >&2
+    exit 1
+    ;;
+esac
+
+echo "Creating/updating Secret orca-credentials in namespace ${NAMESPACE} (origin driver: ${ORIGIN_DRIVER}) ..."
+kubectl --context "kind-${CLUSTER_NAME}" -n "${NAMESPACE}" create secret generic orca-credentials \
+  "${literals[@]}" \
+  --dry-run=client -o yaml | kubectl --context "kind-${CLUSTER_NAME}" apply -f -
+
+echo "orca-credentials Secret applied."
diff --git a/hack/orca/dev-harness.md b/hack/orca/dev-harness.md
new file mode 100644
index 00000000..5147dff9
--- /dev/null
+++ b/hack/orca/dev-harness.md
@@ -0,0 +1,336 @@
+<!-- Copyright (c) Microsoft Corporation. Licensed under the MIT License. -->
+
+# Orca Dev Harness
+
+A local end-to-end harness for the Orca origin cache. Stands up a Kind
+cluster with three Orca replicas, an in-cluster LocalStack as the
+cachestore, and an in-cluster origin (LocalStack S3 by default; Azurite
+when `ORIGIN_DRIVER=azureblob`). Both default paths run with zero real
+cloud credentials. The harness can also be flipped to point at a real
+Azure Blob storage account.
+
+This document covers a single workstation. For the production
+architecture and design rationale, see `design/orca/`. For Go-level
+integration tests that exercise the same code paths without Kubernetes
+(via testcontainers-managed LocalStack and Azurite), see
+[inttest.md](./inttest.md). The two harnesses are complementary: this
+one validates the K8s deployment shape (manifests, headless DNS, image
+build/load); the integration tests cover the Go runtime behavior.
+
+## Origin modes
+
+| `ORIGIN_DRIVER` value | Origin backend | Driver path exercised | Creds needed |
+| --------------------- | -------------- | --------------------- | ------------ |
+| `awss3` (default)     | LocalStack S3 (in-cluster) | `internal/orca/origin/awss3` | None |
+| `azureblob` (Azurite) | Azurite (in-cluster) | `internal/orca/origin/azureblob` | None (well-known dev key) |
+| `azureblob` (real Azure) | Azure Blob Storage | `internal/orca/origin/azureblob` | Account + key in `.env` |
+
+The cachestore is always in-cluster LocalStack S3 (different bucket
+from the awss3 origin).
+
+## What you get
+
+- A Kind cluster named `orca-dev` with one control plane and three
+  worker nodes (one per Orca replica via required pod-anti-affinity).
+- LocalStack 3.8 running in the cluster as the S3-compatible
+  cachestore (and origin in `awss3` mode). Community tier (`latest`
+  is Pro-only and exits with code 55 "License activation failed").
+- Azurite (Microsoft's official Azure Storage emulator) deployed on
+  demand when `ORIGIN_DRIVER=azureblob`. Runs from
+  `mcr.microsoft.com/azure-storage/azurite`.
+- Buckets/containers pre-created by init Jobs:
+  - `orca-cache` (S3) - cachestore (versioning unset; Orca's
+    versioningGate rejects Enabled and Suspended).
+  - `orca-origin` (S3) - origin (used when `ORIGIN_DRIVER=awss3`).
+  - `orca-test` (Azure container) - origin (used when `ORIGIN_DRIVER=azureblob`).
+- Three Orca replicas. mTLS between peers and bearer auth for
+  clients are both disabled in dev (`cluster.internal_tls.enabled=false`,
+  `server.auth.enabled=false`).
+- Helper scripts (seed sample blobs, GET, LIST, clear cache, tail logs).
+
+## Prerequisites
+
+- `kind` (https://kind.sigs.k8s.io/), `kubectl`, `podman` (or `docker`).
+- `go` toolchain (for `go run ./hack/cmd/render-manifests`).
+- Optional (Azure mode only): a real Azure Storage account + container
+  + account key.
+
+No real cloud credentials are required for the default flow.
+
+## One-time setup
+
+```bash
+cp hack/orca/.env.example hack/orca/.env
+# Default values work; only edit if you want Azure mode.
+```
+
+`.env` is git-ignored. The default `ORIGIN_DRIVER=awss3` runs entirely
+on the in-cluster LocalStack.
+
+## Bring it up
+
+```bash
+make -C hack/orca up
+```
+
+This runs, in order:
+
+1. `kind-create` - create the `orca-dev` cluster (idempotent).
+2. `image` - build `ghcr.io/azure/orca:dev` via `make image-orca-local`.
+3. `kind-load` - save the image to a tar and `kind load image-archive`.
+4. `render` - render `deploy/orca/*.yaml.tmpl` with values from `.env`.
+5. `render-dev` - render `deploy/orca/dev/*.yaml.tmpl` (LocalStack, Azurite, init Jobs).
+6. `deploy-localstack` - apply the namespace, LocalStack, wait until
+   ready, run the bucket-init Job (creates `orca-cache` + `orca-origin`),
+   wait for completion.
+7. `deploy-azurite-maybe` - if `ORIGIN_DRIVER=azureblob`, deploy
+   Azurite + run its container-init Job. Skipped for `awss3`.
+8. `deploy-credentials` - create the `orca-credentials` Secret.
+9. `deploy-orca` - apply RBAC, ConfigMap, Services, Deployment.
+10. `wait-ready` - block until all 3 replicas are Ready.
+
+When this finishes you should see something like:
+
+```
+$ make -C hack/orca status
+NAME                                READY   STATUS    RESTARTS   AGE
+azurite-...                         1/1     Running   0          1m   (only in azureblob mode)
+localstack-...                      1/1     Running   0          1m
+orca-azurite-container-init-...     0/1     Completed 0          1m   (only in azureblob mode)
+orca-buckets-init-...               0/1     Completed 0          1m
+orca-7c5d4f9b8c-...                 1/1     Running   0          50s
+orca-7c5d4f9b8c-...                 1/1     Running   0          50s
+orca-7c5d4f9b8c-...                 1/1     Running   0          50s
+```
+
+## Switching origins
+
+Edit `hack/orca/.env`, change `ORIGIN_DRIVER`, then:
+
+```bash
+make -C hack/orca down
+make -C hack/orca up
+```
+
+Or, to keep the cluster but reconfigure Orca and pull in any newly
+needed backends:
+
+```bash
+$EDITOR hack/orca/.env
+make -C hack/orca deploy        # idempotent; brings up Azurite if needed
+make -C hack/orca reset         # rolling-restart Orca with new ConfigMap
+```
+
+## Seed sample data
+
+The dev harness ships a small Go tool, `hack/cmd/orcaseed`, that
+populates the origin container (Azurite or real Azure) with synthetic
+or operator-supplied content. For the canonical recipe (Azurite
+endpoint via NodePort 30100, the four subcommands wrapped as Make
+targets, the per-blob ceiling, etc.) see
+[quickstart.md - Step 3](./quickstart.md#step-3---seed-the-origin).
+
+For real Azure storage, the `seed-azure` Make target invokes
+`orcaseed upload` against your account using credentials from `.env`:
+
+```bash
+make -C hack/orca seed-azure FILE=/path/to/local-file
+```
+
+This replaces the legacy `seed-azure.sh` script (retired). Required
+in `.env`: `AZURE_STORAGE_ACCOUNT`, `AZURE_STORAGE_KEY`,
+`AZURE_CONTAINER`. The endpoint is computed as
+`https://<account>.blob.core.windows.net/`.
+
+For ad-hoc seeding into the in-cluster LocalStack S3 origin (the
+default `awss3` mode), `orcaseed` does not currently speak S3; use a
+one-off Job:
+
+```bash
+kubectl --context kind-orca-dev -n unbounded-kube run orca-seed --rm -it \
+  --image=amazon/aws-cli:latest --restart=Never \
+  --env=AWS_ACCESS_KEY_ID=test \
+  --env=AWS_SECRET_ACCESS_KEY=test \
+  -- \
+  --endpoint-url http://localstack.unbounded-kube.svc.cluster.local:4566 \
+  s3 cp /tmp/your-file s3://orca-origin/your-key
+```
+
+## Exercise the cache
+
+See [quickstart.md - Steps 4-5](./quickstart.md#step-4---port-forward-the-orca-edge)
+for the port-forward + `curl` walkthrough. The cluster-wide
+deduplication, singleflight collapse, and warm-cache behavior are
+verified deterministically by `make orca-inttest` against
+testcontainers; this Kind harness is for validating the Kubernetes
+deployment shape (manifests, image, headless DNS, RBAC, init-Job
+ordering) and for ad-hoc operator exploration.
+
+## See cluster-wide deduplication in action
+
+The integration test `TestSingleflightCollapse` (under
+`internal/orca/inttest/`) deterministically asserts this behavior
+with byte-exact body checks and a `CountingOrigin` decorator. To
+reproduce manually against this harness, fire concurrent GETs of a
+fresh blob and tail the logs:
+
+```bash
+make -C hack/orca logs
+```
+
+You should see exactly one chunk-fill per chunk-key across the
+cluster (coordinator selected by rendezvous-hash). Replicas that
+received the client request but are not the coordinator forward via
+`/internal/fill`. Once a chunk is committed to the cachestore,
+subsequent GETs (and joiners that arrived during the fill) read from
+cache.
+
+## Switching to Azure mode (real Azure)
+
+Edit `hack/orca/.env` and set:
+
+```
+ORIGIN_DRIVER=azureblob
+ORIGIN_ID=azureblob-real
+AZURE_STORAGE_ACCOUNT=<your-account>
+AZURE_STORAGE_KEY=<your-key>
+AZURE_CONTAINER=<your-container>
+AZUREBLOB_ENDPOINT=                # leave blank for real Azure
+```
+
+Then:
+
+```bash
+make -C hack/orca deploy                          # idempotent
+make -C hack/orca seed-azure FILE=/path/to/file   # uploads via orcaseed -> real Azure
+make -C hack/orca reset
+```
+
+The `seed-azure` target uses `hack/cmd/orcaseed` under the hood,
+constructing the endpoint as `https://<account>.blob.core.windows.net/`
+and authenticating with `AZURE_STORAGE_KEY`. Pass `SEED_ARGS='--name foo'`
+to override the destination blob name.
+
+## Reset / iterate
+
+```bash
+# Rebuild the image and rolling-restart the deployment:
+make -C hack/orca reset
+
+# Tear down the whole Kind cluster:
+make -C hack/orca down
+```
+
+To clear the cachestore bucket between manual experiments, exec into
+the LocalStack pod or run a one-off `aws s3 rm s3://orca-cache --recursive`
+job; the prior canned script was retired alongside the seeding helpers.
+
+## Logging
+
+The Orca pods default to info-level structured JSON logging. Set
+`LOG_LEVEL=debug` in `hack/orca/.env` (then `make -C hack/orca deploy
+&& make -C hack/orca reset`) for persistent per-chunk debug tracing,
+or `kubectl set env deployment/orca ORCA_LOG_LEVEL=debug` for a
+one-off runtime override. See
+[quickstart.md - Step 6](./quickstart.md#step-6---watch-the-per-chunk-debug-trace)
+for the structured-log shape and `jq` filter examples.
+
+## Troubleshooting
+
+### `localstack` deployment never goes Ready
+
+Check the LocalStack pod's logs:
+
+```bash
+kubectl --context kind-orca-dev -n unbounded-kube logs deploy/localstack
+```
+
+If you see "License activation failed" with exit code 55, you're on the
+Pro-only `latest` tag. The dev harness pins `localstack/localstack:3.8`
+specifically to avoid this.
+
+### `azurite` deployment never goes Ready (azureblob mode)
+
+Check the Azurite logs:
+
+```bash
+kubectl --context kind-orca-dev -n unbounded-kube logs deploy/azurite
+```
+
+Most commonly the readiness probe is failing because Azurite was
+launched with `--blobHost 127.0.0.1` (default) instead of `0.0.0.0`.
+The harness's manifest already passes the right flag; if you've
+overridden `AzuriteImage` to a custom build, ensure it accepts the
+flag.
+
+### `orca-buckets-init` Job fails
+
+The Job waits up to 120 seconds for LocalStack readiness, then creates
+both `orca-cache` and `orca-origin` and verifies cachestore versioning
+is unset. Failures are typically LocalStack startup taking longer than
+that on a slow disk; rerun the Job:
+
+```bash
+kubectl --context kind-orca-dev -n unbounded-kube delete job orca-buckets-init --ignore-not-found
+make -C hack/orca deploy-localstack
+```
+
+### Orca pods CrashLoopBackOff with "config invalid: ..."
+
+Check what's missing:
+
+```bash
+kubectl --context kind-orca-dev -n unbounded-kube logs deploy/orca | head
+```
+
+Common causes:
+- In Azure mode, an empty `AZURE_STORAGE_ACCOUNT`/`AZURE_CONTAINER`
+  (rendered into the ConfigMap).
+- A missing `orca-credentials` Secret.
+
+Fix:
+
+```bash
+$EDITOR hack/orca/.env
+make -C hack/orca render        # re-render ConfigMap from .env
+make -C hack/orca deploy-credentials
+kubectl --context kind-orca-dev -n unbounded-kube apply -f deploy/orca/rendered/03-config.yaml
+make -C hack/orca reset
+```
+
+### "OriginUnreachable" or 502 from manual GETs
+
+In awss3 (default) mode:
+- The bucket name in the URL must match `ORIGIN_AWSS3_BUCKET` (default
+  `orca-origin`).
+- Seed the bucket manually with `kubectl run orca-seed --rm -it
+  --image=amazon/aws-cli:latest -- ...`.
+
+In Azure mode:
+- Account key wrong or revoked. Re-run `make -C hack/orca deploy-credentials && make -C hack/orca reset`.
+- The blob doesn't exist in `$AZURE_CONTAINER`. Run `make -C hack/orca seed-azure`.
+
+### kind load fails with "tag not found"
+
+The `make image` target tags the image as `ghcr.io/azure/orca:dev` (the
+default `ORCA_VERSION=dev`). If you overrode `VERSION` and got a slash
+in the tag (git describe can produce e.g.
+`images/agent-ubuntu2404-nvidia/v...-dirty`), the OCI tag is invalid.
+Stick with `ORCA_VERSION=dev` for the dev harness.
+
+## What this harness does NOT cover
+
+- `cachestore/posixfs` and `cachestore/localfs` drivers (deferred; v1
+  prototype has only `cachestore/s3`).
+- Production auth (bearer tokens, mTLS edge, internal mTLS). All three
+  are disabled by config in dev.
+- Edge rate limiting and dynamic per-replica origin caps (see s15
+  deferred-optimizations in `design/orca/design.md`).
+- Mid-stream origin resume; if origin stalls after first byte the
+  client sees a truncated body. Acceptable for the prototype.
+- Crash recovery / unowned-key sweep (post-MVP).
+
+For more on what's in vs out of scope, see `design/orca/design.md`
+(in particular the
+[Deferred / future work](../../designs/orca/design.md#15-deferred--future-work)
+section).
diff --git a/hack/orca/down.sh b/hack/orca/down.sh
new file mode 100755
index 00000000..3d59a7c8
--- /dev/null
+++ b/hack/orca/down.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+#
+# down.sh - delete the Orca dev Kind cluster.
+set -euo pipefail
+
+CLUSTER_NAME=${CLUSTER_NAME:?CLUSTER_NAME must be set}
+
+if ! command -v kind >/dev/null 2>&1; then
+  echo "kind is not installed; nothing to do." >&2
+  exit 0
+fi
+
+if ! kind get clusters 2>/dev/null | grep -qx "${CLUSTER_NAME}"; then
+  echo "No Kind cluster named '${CLUSTER_NAME}'; nothing to delete."
+  exit 0
+fi
+
+echo "Deleting Kind cluster '${CLUSTER_NAME}' ..."
+kind delete cluster --name "${CLUSTER_NAME}"
diff --git a/hack/orca/inttest.md b/hack/orca/inttest.md
new file mode 100644
index 00000000..29a737d2
--- /dev/null
+++ b/hack/orca/inttest.md
@@ -0,0 +1,215 @@
+<!-- Copyright (c) Microsoft Corporation. Licensed under the MIT License. -->
+
+# Orca Integration Tests
+
+In-process integration tests for the Orca origin cache. The harness
+brings up real LocalStack and Azurite containers via
+`testcontainers-go` and constructs N in-process `*app.App` instances
+wired to those containers. No Kubernetes cluster is required.
+
+For the Kubernetes-flavored deployment validation harness (Kind +
+manifests + headless DNS), see [dev-harness.md](./dev-harness.md). The
+two harnesses are complementary: the integration tests cover Go-level
+behavior (origin, cachestore, fetch coordinator, cluster routing,
+internal-fill RPC); the dev harness covers the manifest + deployment
+shape.
+
+## Prerequisites
+
+- Docker (or any `DOCKER_HOST`-compatible daemon) reachable from the
+  test process. `testcontainers-go` discovers it via `DOCKER_HOST`,
+  `~/.docker/`, or the standard socket location.
+- `gcc` for `-race` (CGO is required by Go's race detector). On
+  GitHub-hosted Ubuntu runners this is preinstalled. Locally without
+  `gcc`, the Makefile target drops `-race` automatically.
+
+## Running
+
+```sh
+make orca-inttest
+```
+
+Equivalent to:
+
+```sh
+go test -tags=integrationtest -timeout 15m ./internal/orca/inttest/...
+# CI also adds -race
+```
+
+First run pulls `localstack/localstack:3.8` (~700 MB) and
+`mcr.microsoft.com/azure-storage/azurite:3.34.0` (~150 MB). Subsequent
+runs reuse the cached images. Total run time on a warm runner is on
+the order of 25-30 seconds for the entire suite (most of which is
+streaming the 64 MiB multi-chunk blob through the full origin ->
+fetch coordinator -> cachestore pipeline).
+
+## Topology
+
+Every test (except the lifecycle tests) runs against a 3-replica
+in-process cluster, matching the production `deploy/orca` topology.
+All replicas bind to `127.0.0.1` with distinct OS-assigned internal
+ports. Each replica owns its own `StaticPeerSource` so tests can
+mutate one replica's view of the cluster independently.
+
+```
+                  ┌──────────────────────────────────────┐
+                  │           Test Process               │
+                  │                                      │
+   ┌─────────┐    │  ┌──────────┐    ┌───────────────┐   │
+   │ Test t  │────┼─▶│  Client  │───▶│ Replica 1     │   │
+   └─────────┘    │  │ (HTTP)   │    │ 127.0.0.1:e1  │   │
+                  │  └──────────┘    │ internal :i1  │   │
+                  │                  └───────┬───────┘   │
+                  │  ┌─────────────┐         │ peers     │
+                  │  │ Per-replica │◀────────┤ via       │
+                  │  │ Static      │         │ static    │
+                  │  │ PeerSources │         │ source    │
+                  │  └─────────────┘         │           │
+                  │                  ┌───────▼───────┐   │
+                  │                  │ Replica 2     │   │
+                  │                  │ 127.0.0.1:e2  │   │
+                  │                  │ internal :i2  │   │
+                  │                  └───────┬───────┘   │
+                  │                  ┌───────▼───────┐   │
+                  │                  │ Replica 3     │   │
+                  │                  │ 127.0.0.1:e3  │   │
+                  │                  │ internal :i3  │   │
+                  │                  └───────┬───────┘   │
+                  └──────────────────────────┼───────────┘
+                                             │
+                          ┌──────────────────┴───────────┐
+                          ▼                              ▼
+                  ┌────────────────┐            ┌────────────┐
+                  │  LocalStack    │            │  Azurite   │
+                  │  (origin S3 +  │            │  (origin   │
+                  │   cachestore)  │            │   blob)    │
+                  └────────────────┘            └────────────┘
+```
+
+## File layout
+
+```
+internal/orca/inttest/
+├── doc.go              package overview, build tag, TODOs
+├── images.go           pinned container image tags + Azurite dev creds
+├── localstack.go       testcontainers wrapper + S3 helpers
+├── azurite.go          testcontainers wrapper + azblob helpers
+├── seed.go             SmallBlob/MediumBlob/LargeBlob + SeedS3/SeedAzure
+├── peersource.go       StaticPeerSource (cluster.PeerSource impl)
+├── harness.go          StartCluster orchestrator
+├── client.go           typed HTTP helpers (Get / GetRange / Head / List)
+├── originwrap.go       CountingOrigin decorator
+├── internalwrap.go     CountingInternalHandlerWrap (per-IP status counts)
+├── origins_test.go     origin builder helpers
+├── main_test.go        TestMain (shared LocalStack + Azurite)
+├── e2e_test.go         canonical 3-replica end-to-end suite
+└── azure_test.go       azureblob origin smoke (3 replicas)
+```
+
+Driver-level branch coverage (versioning gate, blob-type rejection)
+lives as fast unit tests in the respective driver packages
+(`internal/orca/cachestore/s3`, `internal/orca/origin/azureblob`),
+not here. Those tests run as part of `go test ./...` and cover all
+state branches (empty / Enabled / Suspended versioning;
+BlockBlob / PageBlob / AppendBlob / nil / disabled).
+
+## Test inventory
+
+The integration suite contains **7 tests** focused exclusively on
+behavior that requires real LocalStack/Azurite + a real cluster of
+in-process orca instances. Driver-level branch coverage (versioning
+gate, blob-type rejection, HTTP error mapping, range parsing, chunk
+arithmetic, config env-var fallback, manifest YAML validity) lives as
+fast unit tests in the respective packages and runs as part of
+`make test`.
+
+### `e2e_test.go` (3-replica default)
+
+Tests that exercise chunk fetching naturally exercise both the
+local-fill path (when self happens to win rendezvous for a chunk) and
+the cross-replica `/internal/fill` path (when a peer wins).
+
+- `TestColdAndWarmGet` - cold + warm, warm phase deletes origin
+  object first to prove cache hit.
+- `TestRangedGet` - within-chunk and cross-chunk byte ranges plus
+  several boundary edge cases against a 64-chunk blob (range starts
+  exactly at a boundary, ends exactly at a boundary, covers
+  contiguous full chunks, straddles 5 consecutive boundaries).
+- `TestMultiChunkGet` - 64 MiB / 64 chunks, byte-exact full GET. With
+  3 replicas, statistically every replica is the coordinator for
+  many chunks, exercising both fillLocal and FillFromPeer paths.
+- `TestRendezvousCoordinatorRouting` - GET against a non-coordinator
+  routes through `/internal/fill`; `CountingOrigin` confirms exactly
+  one origin GetRange happened cluster-wide.
+- `TestSingleflightCollapse` - 3 concurrent GETs from 3 replicas for
+  the same 64-chunk blob collapse to >= 64 (and <= 76) origin
+  GetRanges, proving cluster-wide singleflight is genuinely deduping.
+- `TestPeerNotCoordinatorFallback` - real membership-disagreement
+  test. Crafts a phantom peer whose rendezvous score beats the
+  coord's for k, mutates the coord's `StaticPeerSource` to include
+  the phantom, GET via a non-coord replica that still views the real
+  coord as coordinator, asserts (a) byte-exact body and (b)
+  `counter409.Count(coord) >= 1` proving the 409 fallback fired.
+
+### `azure_test.go` (3-replica default)
+
+- `TestAzureBlobOrigin_ColdGet` - the `azureblob` driver works
+  end-to-end against Azurite for a 2-chunk block blob.
+
+### Where the dropped scenarios moved
+
+| Dropped from integration | Lives now as |
+|---|---|
+| `TestBootSelfTest_Pass` | implicit in every other `StartCluster` test (boots through the same `app.Start` path) |
+| `TestNotFound` | `internal/orca/server.TestWriteOriginError` (covers all 5 error mappings) |
+| `TestList` | `internal/orca/server.TestHandleList` (covers normal/empty/truncated/error) |
+| `TestHead` | `internal/orca/server.TestHandleHead` (covers normal/missing-fields/404) |
+| `TestVersionedCachestoreBucketRefused` | `internal/orca/cachestore/s3.TestValidateBucketVersioning` (covers all 3 statuses) |
+| `TestAzureUnsupportedBlobType` | `internal/orca/origin/azureblob.TestValidateBlobType` (covers all 5 cases) |
+
+## Production-code seams used
+
+The harness depends on three test-friendly seams in production code:
+
+1. **`cluster.PeerSource`**: replaces the entire peer-discovery
+   mechanism. Production constructs a DNS-backed source implicitly
+   from `cfg.Cluster.Service` + `net.DefaultResolver`. Tests inject
+   per-replica `StaticPeerSource` instances with explicit ports so
+   multiple replicas can share an IP.
+
+2. **`cluster.Peer.Port`**: zero in production (peer addressed on
+   `cfg.Cluster.InternalListen` port); set in tests so `FillFromPeer`
+   dials each peer's distinct port.
+
+3. **`internal/orca/app.Start(ctx, *config.Config, ...Option)`**:
+   programmatic factory wiring origin / cachestore / cluster / fetch
+   coordinator / edge + internal listeners. Options:
+   - `WithLogger`, `WithResolver`, `WithPeerSource`,
+   - `WithOrigin`, `WithCacheStore`, `WithSkipCachestoreSelfTest`,
+   - `WithInternalHandlerWrap` for the 409 counter.
+
+Production goes through none of these.
+
+## Adding a scenario
+
+1. Pick the right entry point:
+   - 3-replica e2e (most cases): `StartCluster(ctx, t, opts)`.
+   - Driver-level branch coverage (versioning gate, blob-type
+     rejection, etc.): write a unit test in the driver's package
+     against the extracted pure helpers (`validateBucketVersioning`,
+     `validateBlobType`).
+2. Seed the origin: `SeedS3` or `SeedAzure`.
+3. Issue requests via `cl.Get(i).HTTP.Get / GetRange / Head / List`.
+4. Assert byte-exact body, status code, and (where relevant) origin
+   RPC counts via `CountingOrigin` (`opts.OriginOverride`) or peer
+   409 counts via `CountingInternalHandlerWrap`
+   (`opts.InternalHandlerWrap`).
+
+## Future work
+
+Tracked in `doc.go` TODOs:
+
+- `TestEtagChange` (mid-fill mutation): requires a deterministic test
+  seam in `fetch.Coordinator` to pause between chunk fetches.
+- Fault-injection origin / cachestore decorators: timeout, throttle,
+  5xx retry-budget assertions.
diff --git a/hack/orca/kind-config.yaml b/hack/orca/kind-config.yaml
new file mode 100644
index 00000000..0f5b2d21
--- /dev/null
+++ b/hack/orca/kind-config.yaml
@@ -0,0 +1,22 @@
+# Kind cluster config for the Orca dev harness.
+#
+# 1 control-plane + 3 workers. The 3 workers match Orca's default
+# replica count and the required pod-anti-affinity (hostname topology).
+#
+# extraPortMappings on the first worker exposes Azurite's NodePort
+# (default 30100) to the host so the seeder tool (hack/cmd/orcaseed)
+# can reach Azurite at http://localhost:30100/devstoreaccount1/
+# without a kubectl port-forward. NodePort services in Kind aren't
+# routable from the host without explicit port mappings.
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+name: orca-dev
+nodes:
+  - role: control-plane
+  - role: worker
+    extraPortMappings:
+      - containerPort: 30100
+        hostPort: 30100
+        protocol: TCP
+  - role: worker
+  - role: worker
diff --git a/hack/orca/kind-create.sh b/hack/orca/kind-create.sh
new file mode 100755
index 00000000..4b0300ab
--- /dev/null
+++ b/hack/orca/kind-create.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+#
+# kind-create.sh - create the Orca dev Kind cluster idempotently.
+set -euo pipefail
+
+CLUSTER_NAME=${CLUSTER_NAME:?CLUSTER_NAME must be set}
+KIND_CONFIG=${KIND_CONFIG:?KIND_CONFIG must be set}
+
+if ! command -v kind >/dev/null 2>&1; then
+  echo "kind is not installed. See https://kind.sigs.k8s.io/docs/user/quick-start/#installation" >&2
+  exit 1
+fi
+
+if kind get clusters 2>/dev/null | grep -qx "${CLUSTER_NAME}"; then
+  echo "Kind cluster '${CLUSTER_NAME}' already exists; skipping creation."
+  exit 0
+fi
+
+echo "Creating Kind cluster '${CLUSTER_NAME}' from ${KIND_CONFIG} ..."
+kind create cluster --name "${CLUSTER_NAME}" --config "${KIND_CONFIG}" --wait 120s
+
+echo "Cluster ready. Current context:"
+kubectl config current-context
diff --git a/hack/orca/kind-load.sh b/hack/orca/kind-load.sh
new file mode 100755
index 00000000..c1b51d8d
--- /dev/null
+++ b/hack/orca/kind-load.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+#
+# kind-load.sh - sideload the Orca container image into the Kind nodes.
+#
+# Kind clusters can't pull from the local container engine's image
+# store directly. This script saves the image to a tarball with the
+# configured CONTAINER_ENGINE and feeds it to `kind load image-archive`.
+set -euo pipefail
+
+CLUSTER_NAME=${CLUSTER_NAME:?CLUSTER_NAME must be set}
+ORCA_IMAGE=${ORCA_IMAGE:?ORCA_IMAGE must be set}
+CONTAINER_ENGINE=${CONTAINER_ENGINE:-podman}
+
+if ! command -v kind >/dev/null 2>&1; then
+  echo "kind is not installed." >&2
+  exit 1
+fi
+
+tmpdir=$(mktemp -d)
+trap 'rm -rf "${tmpdir}"' EXIT
+
+archive="${tmpdir}/orca.tar"
+echo "Saving ${ORCA_IMAGE} to ${archive} via ${CONTAINER_ENGINE} ..."
+"${CONTAINER_ENGINE}" save -o "${archive}" "${ORCA_IMAGE}"
+
+echo "Loading image into Kind cluster '${CLUSTER_NAME}' ..."
+kind load image-archive "${archive}" --name "${CLUSTER_NAME}"
+
+echo "Image loaded."
diff --git a/hack/orca/quickstart.md b/hack/orca/quickstart.md
new file mode 100644
index 00000000..d3a7e38b
--- /dev/null
+++ b/hack/orca/quickstart.md
@@ -0,0 +1,209 @@
+<!-- Copyright (c) Microsoft Corporation. Licensed under the MIT License. -->
+
+# Orca Dev Cluster Quickstart
+
+End-to-end recipe to stand up a local Kind cluster with Orca pointed
+at an in-cluster Azurite origin and a LocalStack S3 cachestore, then
+seed data and exercise the cache with debug-level traces.
+
+For the longer reference (every Make target, troubleshooting,
+prerequisites, switching origin modes), see [dev-harness.md](./dev-harness.md).
+
+## Prerequisites
+
+- `kind`, `kubectl`, `podman` (or `docker`).
+- `go` toolchain (used to build the orca image and run the
+  `hack/cmd/orcaseed` tool).
+
+## Step 1 - One-time setup
+
+Copy the example env file and edit it for Azurite-with-debug:
+
+```bash
+cp hack/orca/.env.example hack/orca/.env
+$EDITOR hack/orca/.env
+```
+
+Set:
+
+```
+ORIGIN_DRIVER=azureblob
+ORIGIN_ID=azureblob-azurite
+AZURE_CONTAINER=orca-test
+LOG_LEVEL=debug
+```
+
+Leave `AZURE_STORAGE_ACCOUNT`, `AZURE_STORAGE_KEY`, and
+`AZUREBLOB_ENDPOINT` blank - the harness auto-selects
+`devstoreaccount1` + the well-known Azurite dev key + the in-cluster
+Azurite Service URL.
+
+## Step 2 - Bring up the cluster
+
+```bash
+make orca-up
+```
+
+Single command. Builds the orca image, creates the Kind cluster,
+loads the image, deploys LocalStack + Azurite + Orca, waits until
+all three Orca replicas are Ready. Orca pods start with
+`logging.level: debug` so the per-chunk trace is live from the very
+first request.
+
+Expected pods after bring-up:
+
+```bash
+make -C hack/orca status
+# azurite-...                        1/1 Running
+# localstack-...                     1/1 Running
+# orca-azurite-container-init-...    0/1 Completed
+# orca-buckets-init-...              0/1 Completed
+# orca-...                           1/1 Running   (x3)
+```
+
+## Step 3 - Seed the origin
+
+Azurite is exposed to the host via NodePort `30100` (Kind's
+extraPortMapping forwards it to `localhost:30100`), so no
+`kubectl port-forward` is needed for the seeder.
+
+```bash
+# 5 x 10 MiB random blobs named synth-0 ... synth-4
+make -C hack/orca seed-generate SEED_ARGS='--size 10MiB --count 5'
+
+# Or a single 100 MiB blob named big-0
+make -C hack/orca seed-generate SEED_ARGS='--size 100MiB --count 1 --prefix big-'
+
+# Or upload a real file from disk
+make -C hack/orca seed-upload FILE=~/data.tar.gz
+
+# Reproducible content (same --seed -> byte-identical blobs)
+make -C hack/orca seed-generate SEED_ARGS='--size 10MiB --count 3 --seed 42'
+
+# Inspect / clean up
+make -C hack/orca seed-list
+make -C hack/orca seed-delete PREFIX=synth- SEED_ARGS='--yes'
+```
+
+Per-blob ceiling: 1 GiB unless `--force`. Cumulative-bytes warning at
+1 GiB. The seeder uses chunked uploads, so very large blobs do not
+buffer in host memory.
+
+## Step 4 - Port-forward the Orca edge
+
+In a separate terminal:
+
+```bash
+make -C hack/orca port-forward
+# Forwarding from 127.0.0.1:8443 -> 8443
+```
+
+Leave this running.
+
+## Step 5 - Drive the cache
+
+```bash
+# First hit: cold fill. Triggers origin GetRange, cachestore PutChunk.
+curl -v http://localhost:8443/orca-test/synth-0 -o /dev/null
+
+# Second hit: warm cache. catalog hit -> cachestore_get_chunk.
+curl -v http://localhost:8443/orca-test/synth-0 -o /dev/null
+```
+
+For the bigger blob, you can watch chunked streaming behaviour by
+running the GET against `big-0` (12 chunks at the default 8 MiB
+chunk size) and tailing the logs in parallel.
+
+## Step 6 - Watch the per-chunk debug trace
+
+```bash
+# Filter to one bucket
+make -C hack/orca logs | jq 'select(.chunk.bucket=="orca-test")'
+
+# Filter to one source file (e.g. just fetch coordinator decisions)
+make -C hack/orca logs | jq 'select(.source.file | endswith("fetch.go"))'
+
+# Or just the firehose
+make -C hack/orca logs
+```
+
+On a cold fill you should see a sequence like:
+
+```
+edge_request                       (server.EdgeHandler)
+head_object                        (fetch.Coordinator)
+metadata_singleflight_leader       (metadata.Cache)
+azureblob_head_request / _response (origin/azureblob)
+metadata_record                    (metadata.Cache)
+edge_get_plan                      (server.EdgeHandler)
+get_chunk                          (fetch.Coordinator)
+chunkcatalog_lookup_miss           (chunkcatalog.Catalog)
+cachestore_stat_result present:false (cachestore/s3)
+coordinator_selected               (cluster.Cluster)
+fill_local_lead OR peer_fill_attempt (fetch.Coordinator)
+origin_slot_acquired               (fetch.Coordinator.runFill)
+origin_get_range_attempt           (fetch.fetchWithRetry)
+azureblob_get_range_request / _response (origin/azureblob)
+origin_body_received bytes=N       (fetch.runFill)
+cachestore_put_chunk -> _success   (cachestore/s3)
+commit_success                     (fetch.runFill)
+chunkcatalog_record_insert         (chunkcatalog.Catalog)
+edge_get_complete                  (server.EdgeHandler)
+```
+
+On a warm hit only `chunkcatalog_lookup_hit` and
+`cachestore_get_chunk` fire - no origin call, no commit.
+
+## Step 7 - Iterate
+
+```bash
+# After editing Go source:
+make orca-reset
+# Rebuilds image, side-loads into Kind, rolling-restarts. ~30-60s.
+
+# After editing a manifest template or .env:
+make -C hack/orca deploy        # re-render + apply (idempotent)
+make -C hack/orca reset         # bounce to pick up new ConfigMap
+
+# Clear the cachestore between experiments (forces every chunk back
+# to the cold-fill path on next GET):
+kubectl --context kind-orca-dev -n unbounded-kube exec deploy/localstack -- \
+  awslocal s3 rm s3://orca-cache --recursive
+
+# Clear the origin between experiments:
+make -C hack/orca seed-delete SEED_ARGS='--yes'
+```
+
+## Step 8 - Tear down
+
+```bash
+make orca-down
+```
+
+Deletes the Kind cluster (and everything in it).
+
+## Cheat-sheet of common helpers
+
+| Verb | Effect |
+|---|---|
+| `make orca-up` | Full bring-up (idempotent). |
+| `make orca-reset` | Rebuild image + kind-load + rolling-restart Orca. |
+| `make orca-down` | Delete the Kind cluster. |
+| `make -C hack/orca status` | `kubectl get pods -o wide` in the namespace. |
+| `make -C hack/orca logs` | Tail all Orca pods. |
+| `make -C hack/orca port-forward` | localhost:8443 -> edge service. |
+| `make -C hack/orca seed-generate SEED_ARGS='...'` | Synthetic content. |
+| `make -C hack/orca seed-upload FILE=...` | Upload a real file. |
+| `make -C hack/orca seed-list` | What's in the container. |
+| `make -C hack/orca seed-delete [PREFIX=...]` | Remove blobs. |
+
+## Alternative: integration tests (no Kind cluster)
+
+If you don't need to inspect the K8s deployment shape, the Go-level
+integration suite under `internal/orca/inttest/` covers chunked
+fetch + dedup + peer fallback against testcontainers-managed
+LocalStack + Azurite. Much faster, no Kind setup:
+
+```bash
+make orca-inttest    # ~15-20s, requires Docker
+```
diff --git a/images/orca/Containerfile b/images/orca/Containerfile
new file mode 100644
index 00000000..6a987546
--- /dev/null
+++ b/images/orca/Containerfile
@@ -0,0 +1,50 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+# Build stage
+FROM --platform=$BUILDPLATFORM docker.io/library/golang:1.26.2-trixie AS builder
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    make \
+    gcc \
+    git \
+    ca-certificates \
+    && apt-get clean
+
+ENV CGO_ENABLED=0
+ENV GOPATH=/go
+ENV GOTOOLCHAIN=auto
+ENV PATH=$PATH:/go/bin
+
+WORKDIR /src
+
+COPY go.mod go.sum ./
+RUN go mod download
+
+COPY ../../ .
+
+ARG TARGETOS
+ARG TARGETARCH
+ARG VERSION=dev
+ARG GIT_COMMIT=
+ARG BUILD_TIME=
+RUN GOOS=${TARGETOS} GOARCH=${TARGETARCH} \
+    make orca-build VERSION=${VERSION} ${GIT_COMMIT:+GIT_COMMIT=${GIT_COMMIT}} ${BUILD_TIME:+BUILD_TIME=${BUILD_TIME}}
+
+# Runtime stage
+FROM ubuntu:noble
+
+RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+RUN mkdir -p /unbounded/bin
+
+COPY --from=builder /src/bin/orca /unbounded/bin/orca
+
+ENV PATH="/unbounded/bin:${PATH}"
+
+WORKDIR /unbounded
+
+ENTRYPOINT ["/unbounded/bin/orca"]
diff --git a/internal/orca/app/app.go b/internal/orca/app/app.go
new file mode 100644
index 00000000..dcbdbdea
--- /dev/null
+++ b/internal/orca/app/app.go
@@ -0,0 +1,572 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Package app wires the Orca runtime: origin + cachestore + cluster +
+// fetch coordinator + edge / internal HTTP listeners.
+//
+// Production callers (cmd/orca/orca/orca.go) drive this from a YAML
+// config; integration tests (internal/orca/inttest) drive it from a
+// programmatic *config.Config plus options that inject in-memory or
+// counting decorators around the origin / cachestore.
+package app
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"log/slog"
+	"net"
+	"net/http"
+	"sync"
+	"time"
+
+	"github.com/Azure/unbounded/internal/orca/cachestore"
+	cachestores3 "github.com/Azure/unbounded/internal/orca/cachestore/s3"
+	"github.com/Azure/unbounded/internal/orca/chunkcatalog"
+	"github.com/Azure/unbounded/internal/orca/cluster"
+	"github.com/Azure/unbounded/internal/orca/config"
+	"github.com/Azure/unbounded/internal/orca/fetch"
+	"github.com/Azure/unbounded/internal/orca/metadata"
+	"github.com/Azure/unbounded/internal/orca/origin"
+	"github.com/Azure/unbounded/internal/orca/origin/awss3"
+	"github.com/Azure/unbounded/internal/orca/origin/azureblob"
+	"github.com/Azure/unbounded/internal/orca/server"
+)
+
+// App is a running Orca instance.
+//
+// Construct with Start; tear down with Shutdown. Start is non-blocking:
+// the returned App's listeners are accepting connections (via
+// net.Listen) before Start returns, so EdgeAddr / InternalAddr / OpsAddr
+// are resolved (including any :0 ports) by the time the caller sees them.
+type App struct {
+	// EdgeAddr is the resolved client-edge listen address (host:port).
+	// When the config requested ":0" the port is the OS-assigned one.
+	EdgeAddr string
+
+	// InternalAddr is the resolved peer-RPC listen address (host:port).
+	InternalAddr string
+
+	// OpsAddr is the resolved /healthz + /readyz listen address.
+	OpsAddr string
+
+	// Cluster is exposed so tests can inspect peer state and call
+	// Coordinator/Self for assertions. Production callers should treat
+	// this as read-only.
+	Cluster *cluster.Cluster
+
+	log         *slog.Logger
+	edgeSrv     *http.Server
+	internalSrv *http.Server
+	opsSrv      *http.Server
+	wg          sync.WaitGroup
+	errCh       chan error
+
+	// cachestoreReady is set true once the cachestore self-test has
+	// passed (or skipped via WithSkipCachestoreSelfTest). Gated by
+	// the /readyz endpoint.
+	cachestoreReady bool
+}
+
+type options struct {
+	log                 *slog.Logger
+	clusterOpt          cluster.Option
+	origin              origin.Origin
+	cacheStore          cachestore.CacheStore
+	skipCacheSelfTest   bool
+	internalHandlerWrap func(http.Handler) http.Handler
+	edgeListener        net.Listener
+	internalListener    net.Listener
+	opsListener         net.Listener
+}
+
+// Option configures Start.
+type Option func(*options)
+
+// WithLogger overrides the slog.Logger used for the App's output. If
+// not provided, a JSON handler writing to stdout at LevelInfo is used.
+func WithLogger(log *slog.Logger) Option {
+	return func(o *options) { o.log = log }
+}
+
+// WithPeerSource replaces the cluster's entire peer-discovery
+// mechanism. Intended for integration tests that need full control
+// (e.g. per-replica peer sets with explicit ports). Only one such
+// override is meaningful per App; subsequent calls overwrite.
+func WithPeerSource(s cluster.PeerSource) Option {
+	return func(o *options) {
+		o.clusterOpt = cluster.WithPeerSource(s)
+	}
+}
+
+// WithOrigin replaces the origin driver constructed from cfg. Tests use
+// this to wire counting / fault-injecting decorators around a real
+// awss3 or azureblob client.
+func WithOrigin(or origin.Origin) Option {
+	return func(o *options) { o.origin = or }
+}
+
+// WithCacheStore replaces the cachestore driver constructed from cfg.
+// Tests use this to wire a counting / fault-injecting decorator around
+// a real s3 client (or to use an in-memory implementation).
+func WithCacheStore(cs cachestore.CacheStore) Option {
+	return func(o *options) { o.cacheStore = cs }
+}
+
+// WithSkipCachestoreSelfTest disables the boot-time atomic-commit
+// self-test. Useful only in tests that wire a cachestore decorator
+// already known to honor If-None-Match: *.
+func WithSkipCachestoreSelfTest() Option {
+	return func(o *options) { o.skipCacheSelfTest = true }
+}
+
+// WithInternalHandlerWrap installs a decorator around the internal
+// peer-RPC handler. The wrap function receives the production handler
+// and returns one that the http.Server actually serves. Production
+// passes nothing -> identity. Tests use this to count 409 responses
+// per source IP for the not-coordinator fallback assertion.
+func WithInternalHandlerWrap(wrap func(http.Handler) http.Handler) Option {
+	return func(o *options) { o.internalHandlerWrap = wrap }
+}
+
+// WithEdgeListener supplies a pre-bound listener for the client-edge
+// HTTP server, bypassing app.Start's own net.Listen call.
+//
+// TEST-ONLY: production callers must not use this option. It is
+// exposed for integration tests (internal/orca/inttest) that allocate
+// the listener before the app starts so peer sets can advertise the
+// captured port from t=0 without a close-and-rebind race. Using it in
+// production silently disables the cfg.Server.Listen address.
+func WithEdgeListener(ln net.Listener) Option {
+	return func(o *options) { o.edgeListener = ln }
+}
+
+// WithInternalListener supplies a pre-bound listener for the peer-RPC
+// internal HTTP server.
+//
+// TEST-ONLY: see WithEdgeListener.
+func WithInternalListener(ln net.Listener) Option {
+	return func(o *options) { o.internalListener = ln }
+}
+
+// WithOpsListener supplies a pre-bound listener for the ops HTTP
+// server (/healthz, /readyz).
+//
+// TEST-ONLY: see WithEdgeListener.
+func WithOpsListener(ln net.Listener) Option {
+	return func(o *options) { o.opsListener = ln }
+}
+
+// Start wires every dependency and begins serving on the configured
+// listeners. It returns once all listeners are accepting connections
+// (or returns the error that prevented startup).
+//
+// The returned App must be Shutdown by the caller; Start does not own
+// the parent context's lifetime.
+//
+// Ordering note: cluster.New is called before any listener is bound.
+// Peers can therefore attempt internal-fill RPCs against this replica
+// before its listener is accepting; those connects fail and the
+// requester falls back to local fill via fetch.Coordinator.GetChunk's
+// peer-fallback path. This is transient (sub-second between cluster
+// construction and listener bind) and harmless.
+func Start(ctx context.Context, cfg *config.Config, opts ...Option) (*App, error) {
+	o := options{}
+	for _, opt := range opts {
+		opt(&o)
+	}
+
+	log := o.log
+	if log == nil {
+		log = slog.Default()
+	}
+
+	or, err := buildOrigin(ctx, cfg, o.origin, log)
+	if err != nil {
+		return nil, err
+	}
+
+	cs, err := buildCacheStore(ctx, cfg, o.cacheStore, log)
+	if err != nil {
+		return nil, err
+	}
+
+	cachestoreReady := false
+
+	if o.skipCacheSelfTest {
+		// Caller has asserted the cachestore decorator honors
+		// If-None-Match: * (the in-memory store used by tests).
+		// Treat readiness as satisfied immediately.
+		cachestoreReady = true
+	} else {
+		if err := cs.SelfTestAtomicCommit(ctx); err != nil {
+			return nil, fmt.Errorf("cachestore self-test failed: %w", err)
+		}
+
+		log.LogAttrs(ctx, slog.LevelInfo, "cachestore self-test passed")
+
+		cachestoreReady = true
+	}
+
+	clusterOpts := []cluster.Option{cluster.WithLogger(log)}
+	if o.clusterOpt != nil {
+		clusterOpts = append(clusterOpts, o.clusterOpt)
+	}
+
+	cl, err := cluster.New(ctx, cfg.Cluster, clusterOpts...)
+	if err != nil {
+		return nil, fmt.Errorf("init cluster: %w", err)
+	}
+
+	cat := chunkcatalog.New(cfg.ChunkCatalog.MaxEntries, log)
+	mc := metadata.NewCache(cfg.Metadata, log)
+	fc := fetch.NewCoordinator(or, cs, cl, cat, mc, cfg, log)
+
+	edgeHandler := server.NewEdgeHandler(fc, cfg, log)
+
+	var internalHandler http.Handler = server.NewInternalHandler(fc, cl, log)
+	if o.internalHandlerWrap != nil {
+		internalHandler = o.internalHandlerWrap(internalHandler)
+	}
+
+	edgeLn := o.edgeListener
+	if edgeLn == nil {
+		ln, err := net.Listen("tcp", cfg.Server.Listen)
+		if err != nil {
+			cleanupStartFailure(cl, nil, nil)
+
+			return nil, fmt.Errorf("edge listener bind %q: %w", cfg.Server.Listen, err)
+		}
+
+		edgeLn = ln
+	}
+
+	internalLn := o.internalListener
+	if internalLn == nil {
+		ln, err := net.Listen("tcp", cfg.Cluster.InternalListen)
+		if err != nil {
+			cleanupStartFailure(cl, edgeLn, nil)
+
+			return nil, fmt.Errorf("internal listener bind %q: %w", cfg.Cluster.InternalListen, err)
+		}
+
+		internalLn = ln
+	}
+
+	opsLn := o.opsListener
+	if opsLn == nil {
+		ln, err := net.Listen("tcp", cfg.Server.OpsListen)
+		if err != nil {
+			cleanupStartFailure(cl, edgeLn, internalLn)
+
+			return nil, fmt.Errorf("ops listener bind %q: %w", cfg.Server.OpsListen, err)
+		}
+
+		opsLn = ln
+	}
+
+	a := &App{
+		EdgeAddr:     edgeLn.Addr().String(),
+		InternalAddr: internalLn.Addr().String(),
+		OpsAddr:      opsLn.Addr().String(),
+		Cluster:      cl,
+		log:          log,
+		edgeSrv: &http.Server{
+			Handler:           edgeHandler,
+			ReadHeaderTimeout: 10 * time.Second,
+		},
+		internalSrv: &http.Server{
+			Handler:           internalHandler,
+			ReadHeaderTimeout: 10 * time.Second,
+		},
+		errCh:           make(chan error, 3),
+		cachestoreReady: cachestoreReady,
+	}
+
+	a.opsSrv = &http.Server{
+		Handler:           newOpsHandler(a.isReady),
+		ReadHeaderTimeout: 5 * time.Second,
+	}
+
+	a.wg.Add(1)
+
+	go func() {
+		defer a.wg.Done()
+
+		log.LogAttrs(ctx, slog.LevelInfo, "edge listener",
+			slog.String("addr", a.EdgeAddr),
+		)
+
+		if err := a.edgeSrv.Serve(edgeLn); err != nil && !errors.Is(err, http.ErrServerClosed) {
+			a.errCh <- fmt.Errorf("edge listener: %w", err)
+		}
+	}()
+
+	a.wg.Add(1)
+
+	go func() {
+		defer a.wg.Done()
+
+		log.LogAttrs(ctx, slog.LevelInfo, "internal listener",
+			slog.String("addr", a.InternalAddr),
+			slog.Bool("tls_enabled", cfg.Cluster.InternalTLS.Enabled),
+		)
+
+		var lerr error
+		if cfg.Cluster.InternalTLS.Enabled {
+			lerr = a.internalSrv.ServeTLS(internalLn,
+				cfg.Cluster.InternalTLS.CertFile,
+				cfg.Cluster.InternalTLS.KeyFile,
+			)
+		} else {
+			log.LogAttrs(ctx, slog.LevelWarn, "internal listener TLS DISABLED - unsafe for production",
+				slog.String("addr", a.InternalAddr),
+			)
+
+			lerr = a.internalSrv.Serve(internalLn)
+		}
+
+		if lerr != nil && !errors.Is(lerr, http.ErrServerClosed) {
+			a.errCh <- fmt.Errorf("internal listener: %w", lerr)
+		}
+	}()
+
+	a.wg.Add(1)
+
+	go func() {
+		defer a.wg.Done()
+
+		log.LogAttrs(ctx, slog.LevelInfo, "ops listener",
+			slog.String("addr", a.OpsAddr),
+		)
+
+		if err := a.opsSrv.Serve(opsLn); err != nil && !errors.Is(err, http.ErrServerClosed) {
+			a.errCh <- fmt.Errorf("ops listener: %w", err)
+		}
+	}()
+
+	return a, nil
+}
+
+// cleanupStartFailure unwinds partially-constructed Start state when
+// a subsequent step (e.g. a later net.Listen) fails. Closes any
+// listeners already bound and tells the cluster to stop its refresh
+// goroutine within a bounded budget.
+func cleanupStartFailure(cl *cluster.Cluster, listeners ...net.Listener) {
+	for _, ln := range listeners {
+		if ln == nil {
+			continue
+		}
+
+		_ = ln.Close() //nolint:errcheck // best-effort close on bind failure
+	}
+
+	closeCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	_ = cl.Close(closeCtx) //nolint:errcheck // best-effort cleanup on bind failure
+}
+
+// newOpsHandler returns the http.Handler serving /healthz and
+// /readyz for kubelet probes. /healthz is unconditional 200
+// (process-alive); /readyz returns 200 only when isReady reports
+// true. isReady is injected so tests can drive the readiness
+// signal independently of the surrounding App.
+func newOpsHandler(isReady func() bool) http.Handler {
+	mux := http.NewServeMux()
+	mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusOK)
+		_, _ = w.Write([]byte("ok")) //nolint:errcheck // best-effort probe response
+	})
+	mux.HandleFunc("/readyz", func(w http.ResponseWriter, _ *http.Request) {
+		if !isReady() {
+			w.WriteHeader(http.StatusServiceUnavailable)
+			_, _ = w.Write([]byte("not ready")) //nolint:errcheck // best-effort probe response
+
+			return
+		}
+
+		w.WriteHeader(http.StatusOK)
+		_, _ = w.Write([]byte("ready")) //nolint:errcheck // best-effort probe response
+	})
+
+	return mux
+}
+
+// isReady reports whether the app is ready to serve traffic.
+// Both conditions must hold:
+//   - cachestore self-test passed (or skipped via the test option).
+//   - cluster has loaded an initial peer-set snapshot.
+func (a *App) isReady() bool {
+	return a.cachestoreReady && a.Cluster.HasInitialSnapshot()
+}
+
+// Wait blocks until either the parent context is canceled or one of
+// the listeners exits unexpectedly. It returns the first listener
+// error (if any) or nil if ctx was canceled. Wait is intended for
+// the production "serve until SIGTERM" path; tests typically call
+// Shutdown directly.
+//
+// Any listener errors that arrive concurrently with the wait-return
+// (ctx-cancel branch or first-error branch) are drained and logged
+// at Warn so they aren't silently discarded. Without this, a
+// shutdown that overlaps with a listener failure - or a multi-
+// listener crash where two listeners errored within the same tick -
+// would lose all but the first error.
+//
+// Priority: when ctx is already canceled at the time Wait is called,
+// the ctx-cancel branch is taken deterministically even if errCh
+// also has buffered errors. Go's select non-determinism would
+// otherwise flip the return value between nil and a buffered error
+// on a tick race, contradicting the documented "nil if ctx was
+// canceled" contract. The buffered errors are still logged via
+// drainErrCh; only their effect on Wait's return value is
+// suppressed in this specific overlap.
+func (a *App) Wait(ctx context.Context) error {
+	// Non-blocking pre-check: if ctx is already canceled, take the
+	// shutdown branch without exposing the select-randomization
+	// race against any errors that may have arrived alongside the
+	// cancellation. See the function comment for rationale.
+	select {
+	case <-ctx.Done():
+		a.drainErrCh(ctx, "listener error received during shutdown")
+
+		return nil
+	default:
+	}
+
+	select {
+	case <-ctx.Done():
+		a.drainErrCh(ctx, "listener error received during shutdown")
+
+		return nil
+	case err := <-a.errCh:
+		a.drainErrCh(ctx, "additional listener error after first")
+
+		return err
+	}
+}
+
+// drainErrCh non-blockingly consumes any remaining errors from
+// a.errCh and logs them at Warn with the given message. Used by
+// Wait on both return paths to ensure no listener error is silently
+// dropped.
+func (a *App) drainErrCh(ctx context.Context, msg string) {
+	for {
+		select {
+		case err := <-a.errCh:
+			a.log.LogAttrs(ctx, slog.LevelWarn, msg,
+				slog.Any("err", err),
+			)
+		default:
+			return
+		}
+	}
+}
+
+// Shutdown gracefully stops every listener and the cluster goroutine.
+// It is safe to call multiple times; subsequent calls are no-ops.
+func (a *App) Shutdown(ctx context.Context) error {
+	var firstErr error
+
+	if err := a.edgeSrv.Shutdown(ctx); err != nil {
+		a.log.LogAttrs(ctx, slog.LevelWarn, "edge listener shutdown failed",
+			slog.Any("err", err),
+		)
+
+		firstErr = err
+	}
+
+	if err := a.internalSrv.Shutdown(ctx); err != nil {
+		a.log.LogAttrs(ctx, slog.LevelWarn, "internal listener shutdown failed",
+			slog.Any("err", err),
+		)
+
+		if firstErr == nil {
+			firstErr = err
+		}
+	}
+
+	if a.opsSrv != nil {
+		if err := a.opsSrv.Shutdown(ctx); err != nil {
+			a.log.LogAttrs(ctx, slog.LevelWarn, "ops listener shutdown failed",
+				slog.Any("err", err),
+			)
+
+			if firstErr == nil {
+				firstErr = err
+			}
+		}
+	}
+
+	if err := a.Cluster.Close(ctx); err != nil {
+		a.log.LogAttrs(ctx, slog.LevelWarn, "cluster close did not finish before ctx deadline",
+			slog.Any("err", err),
+		)
+
+		if firstErr == nil {
+			firstErr = err
+		}
+	}
+
+	a.wg.Wait()
+
+	return firstErr
+}
+
+func buildOrigin(ctx context.Context, cfg *config.Config, override origin.Origin, log *slog.Logger) (origin.Origin, error) {
+	if override != nil {
+		return override, nil
+	}
+
+	switch cfg.Origin.Driver {
+	case "azureblob":
+		or, err := azureblob.New(cfg.Origin.Azureblob, log)
+		if err != nil {
+			return nil, fmt.Errorf("init origin/azureblob: %w", err)
+		}
+
+		return or, nil
+	case "awss3":
+		or, err := awss3.New(ctx, awss3.Config{
+			Endpoint:     cfg.Origin.AWSS3.Endpoint,
+			Region:       cfg.Origin.AWSS3.Region,
+			Bucket:       cfg.Origin.AWSS3.Bucket,
+			AccessKey:    cfg.Origin.AWSS3.AccessKey,
+			SecretKey:    cfg.Origin.AWSS3.SecretKey,
+			UsePathStyle: cfg.Origin.AWSS3.UsePathStyle,
+		}, log)
+		if err != nil {
+			return nil, fmt.Errorf("init origin/awss3: %w", err)
+		}
+
+		return or, nil
+	default:
+		return nil, fmt.Errorf("unsupported origin driver: %q", cfg.Origin.Driver)
+	}
+}
+
+func buildCacheStore(ctx context.Context, cfg *config.Config, override cachestore.CacheStore, log *slog.Logger) (cachestore.CacheStore, error) {
+	if override != nil {
+		return override, nil
+	}
+
+	switch cfg.Cachestore.Driver {
+	case "s3":
+		cs, err := cachestores3.New(ctx, cachestores3.Config{
+			Endpoint:     cfg.Cachestore.S3.Endpoint,
+			Bucket:       cfg.Cachestore.S3.Bucket,
+			Region:       cfg.Cachestore.S3.Region,
+			AccessKey:    cfg.Cachestore.S3.AccessKey,
+			SecretKey:    cfg.Cachestore.S3.SecretKey,
+			UsePathStyle: cfg.Cachestore.S3.UsePathStyle,
+		}, log)
+		if err != nil {
+			return nil, fmt.Errorf("init cachestore/s3: %w", err)
+		}
+
+		return cs, nil
+	default:
+		return nil, fmt.Errorf("unsupported cachestore driver: %q", cfg.Cachestore.Driver)
+	}
+}
diff --git a/internal/orca/app/app_test.go b/internal/orca/app/app_test.go
new file mode 100644
index 00000000..37cf9ff6
--- /dev/null
+++ b/internal/orca/app/app_test.go
@@ -0,0 +1,146 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package app
+
+import (
+	"context"
+	"errors"
+	"log/slog"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"sync/atomic"
+	"testing"
+)
+
+// TestOpsHandler_Healthz_AlwaysReturnsOK locks the contract that
+// /healthz is process-liveness only: it returns 200 unconditionally,
+// without consulting any readiness signal. Kubelet liveness probes
+// must succeed even before the app has fully bootstrapped.
+func TestOpsHandler_Healthz_AlwaysReturnsOK(t *testing.T) {
+	t.Parallel()
+
+	// readyFn is set to always-false; healthz must still 200.
+	h := newOpsHandler(func() bool { return false })
+
+	req := httptest.NewRequest(http.MethodGet, "/healthz", nil)
+	rr := httptest.NewRecorder()
+	h.ServeHTTP(rr, req)
+
+	if rr.Code != http.StatusOK {
+		t.Errorf("healthz status = %d, want %d", rr.Code, http.StatusOK)
+	}
+}
+
+// TestOpsHandler_Readyz_NotReadyReturns503 verifies that /readyz
+// surfaces 503 Service Unavailable while the readiness signal is
+// false. Kubelet readiness probes use 503 to gate Service endpoint
+// inclusion so traffic does not arrive until the app is ready.
+func TestOpsHandler_Readyz_NotReadyReturns503(t *testing.T) {
+	t.Parallel()
+
+	h := newOpsHandler(func() bool { return false })
+
+	req := httptest.NewRequest(http.MethodGet, "/readyz", nil)
+	rr := httptest.NewRecorder()
+	h.ServeHTTP(rr, req)
+
+	if rr.Code != http.StatusServiceUnavailable {
+		t.Errorf("readyz status = %d, want %d", rr.Code, http.StatusServiceUnavailable)
+	}
+}
+
+// TestOpsHandler_Readyz_ReadyReturns200 verifies the readiness
+// transition from 503 to 200 when the injected signal flips. This
+// is the bootstrap path the app drives once the cachestore
+// self-test has passed and the cluster has loaded its initial
+// peer-set snapshot.
+func TestOpsHandler_Readyz_ReadyReturns200(t *testing.T) {
+	t.Parallel()
+
+	var ready atomic.Bool
+
+	h := newOpsHandler(ready.Load)
+
+	// Initial: not ready.
+	req := httptest.NewRequest(http.MethodGet, "/readyz", nil)
+	rr := httptest.NewRecorder()
+	h.ServeHTTP(rr, req)
+
+	if rr.Code != http.StatusServiceUnavailable {
+		t.Fatalf("pre-ready readyz = %d, want %d", rr.Code, http.StatusServiceUnavailable)
+	}
+	// Flip readiness and re-probe.
+	ready.Store(true)
+
+	req = httptest.NewRequest(http.MethodGet, "/readyz", nil)
+	rr = httptest.NewRecorder()
+	h.ServeHTTP(rr, req)
+
+	if rr.Code != http.StatusOK {
+		t.Errorf("post-ready readyz = %d, want %d", rr.Code, http.StatusOK)
+	}
+}
+
+// TestApp_IsReady_RequiresCachestoreReady locks the AND-gating
+// behaviour of isReady. When cachestoreReady is false, isReady must
+// short-circuit and return false without touching the Cluster
+// pointer. Without that short-circuit a self-test failure that
+// leaves Cluster nil would panic the /readyz handler.
+func TestApp_IsReady_RequiresCachestoreReady(t *testing.T) {
+	t.Parallel()
+
+	a := &App{cachestoreReady: false}
+
+	defer func() {
+		if r := recover(); r != nil {
+			t.Fatalf("isReady panicked instead of short-circuiting on cachestoreReady=false: %v", r)
+		}
+	}()
+
+	if a.isReady() {
+		t.Errorf("isReady = true with cachestoreReady=false")
+	}
+}
+
+// TestApp_Wait_DrainsErrChOnCtxCancel verifies that listener errors
+// arriving alongside a shutdown ctx are all logged rather than only
+// the first being preserved. Pre-fills errCh with three errors,
+// then cancels ctx; Wait should drain all three to the logger.
+//
+// Regression for M-4 / the earlier app.Wait drain work; the
+// expanded drain helper now applies to both Wait return paths so a
+// multi-listener crash within a tick doesn't lose errors.
+func TestApp_Wait_DrainsErrChOnCtxCancel(t *testing.T) {
+	t.Parallel()
+
+	var buf strings.Builder
+
+	log := slog.New(slog.NewTextHandler(&buf, &slog.HandlerOptions{Level: slog.LevelWarn}))
+
+	a := &App{
+		log:   log,
+		errCh: make(chan error, 4),
+	}
+
+	a.errCh <- errors.New("edge boom")
+
+	a.errCh <- errors.New("internal boom")
+
+	a.errCh <- errors.New("ops boom")
+
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel() // ctx already cancelled when Wait starts
+
+	if err := a.Wait(ctx); err != nil {
+		t.Errorf("Wait err = %v, want nil (ctx cancelled)", err)
+	}
+
+	out := buf.String()
+	for _, want := range []string{"edge boom", "internal boom", "ops boom"} {
+		if !strings.Contains(out, want) {
+			t.Errorf("drained log missing %q; got %q", want, out)
+		}
+	}
+}
diff --git a/internal/orca/cachestore/cachestore.go b/internal/orca/cachestore/cachestore.go
new file mode 100644
index 00000000..9b99f5df
--- /dev/null
+++ b/internal/orca/cachestore/cachestore.go
@@ -0,0 +1,45 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Package cachestore defines the in-DC chunk store interface and shared
+// types. Concrete drivers live under cachestore/<driver>/.
+//
+// All drivers must implement atomic commit (CAS-style PutChunk that
+// rejects overwrites) so concurrent fills across replicas converge
+// without clobbering each other; SelfTestAtomicCommit is run at boot
+// to verify the backend honors the precondition.
+package cachestore
+
+import (
+	"context"
+	"errors"
+	"io"
+	"time"
+
+	"github.com/Azure/unbounded/internal/orca/chunk"
+)
+
+// CacheStore is where chunk bytes physically live. Source of truth for
+// chunk presence; backed by an in-DC S3-like store in production and
+// LocalStack in dev.
+type CacheStore interface {
+	GetChunk(ctx context.Context, k chunk.Key, off, n int64) (io.ReadCloser, error)
+	PutChunk(ctx context.Context, k chunk.Key, size int64, r io.Reader) error
+	Stat(ctx context.Context, k chunk.Key) (Info, error)
+	Delete(ctx context.Context, k chunk.Key) error
+	SelfTestAtomicCommit(ctx context.Context) error
+}
+
+// Info is the result of a successful Stat.
+type Info struct {
+	Size      int64
+	Committed time.Time
+}
+
+// Sentinel errors. Wrap with %w so callers use errors.Is.
+var (
+	ErrNotFound   = errors.New("cachestore: not found")
+	ErrTransient  = errors.New("cachestore: transient")
+	ErrAuth       = errors.New("cachestore: auth")
+	ErrCommitLost = errors.New("cachestore: commit lost (no-clobber denied)")
+)
diff --git a/internal/orca/cachestore/s3/s3.go b/internal/orca/cachestore/s3/s3.go
new file mode 100644
index 00000000..50e46668
--- /dev/null
+++ b/internal/orca/cachestore/s3/s3.go
@@ -0,0 +1,528 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Package s3 is the cachestore driver for in-DC S3-compatible stores.
+// In production this targets VAST or another S3-compatible object
+// store; in dev it targets LocalStack.
+//
+// Atomic commit is implemented via PutObject + If-None-Match: * (s3
+// conditional writes). The boot SelfTestAtomicCommit verifies the
+// backend honors the precondition; the boot versioning gate verifies
+// the bucket is not versioned (since If-None-Match is not honored on
+// versioned buckets).
+package s3
+
+import (
+	"bytes"
+	"context"
+	"crypto/rand"
+	"encoding/hex"
+	"errors"
+	"fmt"
+	"io"
+	"log/slog"
+	"net/http"
+
+	"github.com/aws/aws-sdk-go-v2/aws"
+	awshttp "github.com/aws/aws-sdk-go-v2/aws/transport/http"
+	awsconfig "github.com/aws/aws-sdk-go-v2/config"
+	"github.com/aws/aws-sdk-go-v2/credentials"
+	"github.com/aws/aws-sdk-go-v2/service/s3"
+	s3types "github.com/aws/aws-sdk-go-v2/service/s3/types"
+	"github.com/aws/smithy-go"
+
+	"github.com/Azure/unbounded/internal/orca/cachestore"
+	"github.com/Azure/unbounded/internal/orca/chunk"
+)
+
+// Driver implements cachestore.CacheStore against an S3-compatible
+// endpoint.
+type Driver struct {
+	client *s3.Client
+	bucket string
+	log    *slog.Logger
+}
+
+// Config is the s3-driver configuration. Mirrors config.CachestoreS3
+// but kept package-local so the driver can be unit-tested without
+// importing the whole config package.
+type Config struct {
+	Endpoint     string
+	Bucket       string
+	Region       string
+	AccessKey    string
+	SecretKey    string
+	UsePathStyle bool
+}
+
+// New constructs a Driver. The bucket-versioning gate is run here
+// unconditionally: a versioned bucket silently breaks the no-clobber
+// atomic-commit primitive (PutObject + If-None-Match: *) so the
+// driver refuses to start against one.
+//
+// The log receives debug-level emissions for every chunk operation
+// (Get, Put, Stat, Delete) and step-by-step boot trace from
+// SelfTestAtomicCommit / versioningGate. Passing nil falls back to
+// slog.Default().
+//
+// SelfTestAtomicCommit is a separate step (called by main after New)
+// to keep the constructor side-effect-light.
+func New(ctx context.Context, cfg Config, log *slog.Logger) (*Driver, error) {
+	if cfg.Bucket == "" {
+		return nil, fmt.Errorf("cachestore/s3: bucket required")
+	}
+
+	if cfg.Endpoint == "" {
+		return nil, fmt.Errorf("cachestore/s3: endpoint required")
+	}
+
+	awsCfg, err := awsconfig.LoadDefaultConfig(ctx,
+		awsconfig.WithRegion(cfg.Region),
+		awsconfig.WithCredentialsProvider(credentials.NewStaticCredentialsProvider(
+			cfg.AccessKey, cfg.SecretKey, "",
+		)),
+		// Opt out of CRC64NVME default introduced in aws-sdk-go-v2
+		// 1.32. LocalStack 3.8 returns InvalidRequest for unknown
+		// algorithms; real AWS S3 still works either way.
+		awsconfig.WithRequestChecksumCalculation(aws.RequestChecksumCalculationWhenRequired),
+		awsconfig.WithResponseChecksumValidation(aws.ResponseChecksumValidationWhenRequired),
+	)
+	if err != nil {
+		return nil, fmt.Errorf("cachestore/s3: aws config: %w", err)
+	}
+
+	client := s3.NewFromConfig(awsCfg, func(o *s3.Options) {
+		o.BaseEndpoint = aws.String(cfg.Endpoint)
+		o.UsePathStyle = cfg.UsePathStyle
+	})
+
+	if log == nil {
+		log = slog.Default()
+	}
+
+	d := &Driver{
+		client: client,
+		bucket: cfg.Bucket,
+		log:    log,
+	}
+
+	if err := d.versioningGate(ctx); err != nil {
+		return nil, err
+	}
+
+	return d, nil
+}
+
+// versioningGate refuses to start if the bucket has versioning enabled
+// or suspended. If-None-Match: * is not honored against versioned
+// buckets, which would silently break atomic commit's no-clobber
+// guarantee.
+func (d *Driver) versioningGate(ctx context.Context) error {
+	d.log.LogAttrs(ctx, slog.LevelDebug, "versioning_gate_probe",
+		slog.String("bucket", d.bucket),
+	)
+
+	out, err := d.client.GetBucketVersioning(ctx, &s3.GetBucketVersioningInput{
+		Bucket: aws.String(d.bucket),
+	})
+	if err != nil {
+		return fmt.Errorf("cachestore/s3: GetBucketVersioning failed: %w", err)
+	}
+
+	d.log.LogAttrs(ctx, slog.LevelDebug, "versioning_gate_status",
+		slog.String("bucket", d.bucket),
+		slog.String("status", string(out.Status)),
+	)
+
+	return validateBucketVersioning(d.bucket, out.Status)
+}
+
+// validateBucketVersioning returns an error if the bucket's versioning
+// status is incompatible with cachestore/s3's atomic-commit primitive.
+// Extracted as a pure function so unit tests can cover all branches
+// (empty / Enabled / Suspended) without round-tripping to a real or
+// emulated S3 backend.
+func validateBucketVersioning(bucket string, status s3types.BucketVersioningStatus) error {
+	switch status {
+	case s3types.BucketVersioningStatusEnabled, s3types.BucketVersioningStatusSuspended:
+		return fmt.Errorf(
+			"cachestore/s3: bucket %s has versioning %s; If-None-Match: * is not "+
+				"honored on versioned buckets and the atomic-commit primitive cannot "+
+				"guarantee no-clobber; disable bucket versioning to use cachestore/s3",
+			bucket, status)
+	}
+
+	return nil
+}
+
+// SelfTestAtomicCommit verifies the backend honors PutObject +
+// If-None-Match: *.
+func (d *Driver) SelfTestAtomicCommit(ctx context.Context) error {
+	suffix, err := randHex(16)
+	if err != nil {
+		return fmt.Errorf("cachestore/s3 self-test: generate probe key: %w", err)
+	}
+
+	probeKey := fmt.Sprintf("_orca-selftest/%s", suffix)
+	body := []byte("orca-selftest")
+
+	d.log.LogAttrs(ctx, slog.LevelDebug, "selftest_first_put",
+		slog.String("bucket", d.bucket),
+		slog.String("probe_key", probeKey),
+	)
+
+	// First put: must succeed.
+	_, err = d.client.PutObject(ctx, &s3.PutObjectInput{
+		Bucket:      aws.String(d.bucket),
+		Key:         aws.String(probeKey),
+		Body:        bytes.NewReader(body),
+		IfNoneMatch: aws.String("*"),
+	})
+	if err != nil {
+		return fmt.Errorf("cachestore/s3 self-test: first put failed: %w", err)
+	}
+
+	d.log.LogAttrs(ctx, slog.LevelDebug, "selftest_second_put_expecting_412",
+		slog.String("bucket", d.bucket),
+		slog.String("probe_key", probeKey),
+	)
+
+	// Second put: must fail with 412.
+	_, err = d.client.PutObject(ctx, &s3.PutObjectInput{
+		Bucket:      aws.String(d.bucket),
+		Key:         aws.String(probeKey),
+		Body:        bytes.NewReader(body),
+		IfNoneMatch: aws.String("*"),
+	})
+	if err == nil {
+		// Clean up before returning the failure.
+		_, _ = d.client.DeleteObject(ctx, &s3.DeleteObjectInput{ //nolint:errcheck // best-effort selftest cleanup
+			Bucket: aws.String(d.bucket),
+			Key:    aws.String(probeKey),
+		})
+
+		return fmt.Errorf(
+			"cachestore/s3: backend does not honor If-None-Match: *; refusing to start " +
+				"(second concurrent put returned 200 instead of 412)")
+	}
+
+	if !isPreconditionFailed(err) {
+		_, _ = d.client.DeleteObject(ctx, &s3.DeleteObjectInput{ //nolint:errcheck // best-effort selftest cleanup
+			Bucket: aws.String(d.bucket),
+			Key:    aws.String(probeKey),
+		})
+
+		return fmt.Errorf("cachestore/s3 self-test: second put returned unexpected error "+
+			"(want 412 PreconditionFailed): %w", err)
+	}
+
+	d.log.LogAttrs(ctx, slog.LevelDebug, "selftest_second_put_rejected_412",
+		slog.String("bucket", d.bucket),
+		slog.String("probe_key", probeKey),
+	)
+
+	// Cleanup probe key.
+	_, _ = d.client.DeleteObject(ctx, &s3.DeleteObjectInput{ //nolint:errcheck // best-effort selftest cleanup
+		Bucket: aws.String(d.bucket),
+		Key:    aws.String(probeKey),
+	})
+
+	return nil
+}
+
+// GetChunk fetches [off, off+n) of the chunk path from the bucket.
+//
+// Rejects n <= 0 with a sentinel ErrInvalidArgument: the wire-format
+// boundary (cluster.DecodeChunkKey) already rejects object_size <= 0,
+// so an in-process caller asking for a zero-length read is a logic
+// bug. Forwarding the request would yield a malformed S3 Range
+// header (bytes=0--1).
+func (d *Driver) GetChunk(ctx context.Context, k chunk.Key, off, n int64) (io.ReadCloser, error) {
+	if n <= 0 {
+		return nil, fmt.Errorf("cachestore/s3 get: n must be > 0, got %d", n)
+	}
+
+	if off < 0 {
+		return nil, fmt.Errorf("cachestore/s3 get: off must be >= 0, got %d", off)
+	}
+
+	rng := fmt.Sprintf("bytes=%d-%d", off, off+n-1)
+
+	d.log.LogAttrs(ctx, slog.LevelDebug, "cachestore_get_chunk",
+		csChunkAttrs(k),
+		slog.Int64("off", off),
+		slog.Int64("n", n),
+	)
+
+	out, err := d.client.GetObject(ctx, &s3.GetObjectInput{
+		Bucket: aws.String(d.bucket),
+		Key:    aws.String(k.Path()),
+		Range:  aws.String(rng),
+	})
+	if err != nil {
+		mapped := mapErr(err)
+		d.log.LogAttrs(ctx, slog.LevelDebug, "cachestore_get_chunk_err",
+			csChunkAttrs(k),
+			slog.Any("err", mapped),
+		)
+
+		return nil, mapped
+	}
+
+	return out.Body, nil
+}
+
+// PutChunk uploads the chunk via PutObject + If-None-Match: *. On
+// 412 returns ErrCommitLost (loser of an atomic-commit race).
+//
+// Rejects size <= 0 with a sentinel error: a zero-byte chunk is
+// never a legitimate fill result (the wire-format boundary already
+// rejects object_size <= 0, and the smallest legitimate tail chunk
+// is 1 byte), and uploading a zero-byte object would poison the
+// path so later GetChunk(n=expected) reads return 0 bytes and break
+// the streaming model.
+func (d *Driver) PutChunk(ctx context.Context, k chunk.Key, size int64, r io.Reader) error {
+	if size <= 0 {
+		return fmt.Errorf("cachestore/s3 put: size must be > 0, got %d", size)
+	}
+	// AWS SDK v2 needs an io.ReadSeeker for unsigned-payload uploads
+	// (so it can rewind on signed-retry). If the caller already passed
+	// a seekable reader we hand it to the SDK directly; otherwise
+	// buffer the bytes ourselves as a fallback.
+	body, ok := r.(io.ReadSeeker)
+	if !ok {
+		buf, err := io.ReadAll(r)
+		if err != nil {
+			return fmt.Errorf("cachestore/s3 put: read body: %w", err)
+		}
+		// Validate the actual byte count against the caller's
+		// claimed size.
+		if int64(len(buf)) != size {
+			return fmt.Errorf("cachestore/s3 put: short body (got %d want %d)", len(buf), size)
+		}
+
+		body = bytes.NewReader(buf)
+	} else {
+		// Seekable-path size validation: probe the reader's length
+		// via Seek(0, End), confirm it matches the declared size,
+		// then rewind to position 0 for the upload. Without this
+		// guard, a buggy caller passing a Reader of length M with
+		// size=N would either be rejected by S3 (ContentLength
+		// mismatch) or upload a truncated / overlong blob,
+		// depending on backend behaviour. The wire-format boundary
+		// already rejects size <= 0; this catches the size > 0 but
+		// mismatched-bytes case at the driver entry point.
+		end, err := body.Seek(0, io.SeekEnd)
+		if err != nil {
+			return fmt.Errorf("cachestore/s3 put: seek-end: %w", err)
+		}
+
+		if end != size {
+			return fmt.Errorf("cachestore/s3 put: seekable reader length %d does not match size %d", end, size)
+		}
+
+		if _, err := body.Seek(0, io.SeekStart); err != nil {
+			return fmt.Errorf("cachestore/s3 put: seek-rewind: %w", err)
+		}
+	}
+
+	d.log.LogAttrs(ctx, slog.LevelDebug, "cachestore_put_chunk",
+		csChunkAttrs(k),
+		slog.Int64("size", size),
+	)
+
+	_, err := d.client.PutObject(ctx, &s3.PutObjectInput{
+		Bucket:        aws.String(d.bucket),
+		Key:           aws.String(k.Path()),
+		Body:          body,
+		ContentLength: aws.Int64(size),
+		IfNoneMatch:   aws.String("*"),
+	})
+	if err != nil {
+		if isPreconditionFailed(err) {
+			d.log.LogAttrs(ctx, slog.LevelDebug, "cachestore_put_commit_lost",
+				csChunkAttrs(k),
+			)
+
+			return cachestore.ErrCommitLost
+		}
+
+		mapped := mapErr(err)
+		d.log.LogAttrs(ctx, slog.LevelDebug, "cachestore_put_err",
+			csChunkAttrs(k),
+			slog.Any("err", mapped),
+		)
+
+		return mapped
+	}
+
+	d.log.LogAttrs(ctx, slog.LevelDebug, "cachestore_put_success",
+		csChunkAttrs(k),
+		slog.Int64("size", size),
+	)
+
+	return nil
+}
+
+// Stat checks for chunk presence.
+func (d *Driver) Stat(ctx context.Context, k chunk.Key) (cachestore.Info, error) {
+	out, err := d.client.HeadObject(ctx, &s3.HeadObjectInput{
+		Bucket: aws.String(d.bucket),
+		Key:    aws.String(k.Path()),
+	})
+	if err != nil {
+		mapped := mapErr(err)
+		// ErrNotFound is the expected 'miss' result for Stat; logged
+		// at the same debug level as the hit path so cache-hit-rate
+		// diagnostics can count both.
+		d.log.LogAttrs(ctx, slog.LevelDebug, "cachestore_stat_result",
+			csChunkAttrs(k),
+			slog.Bool("present", false),
+			slog.Any("err", mapped),
+		)
+
+		return cachestore.Info{}, mapped
+	}
+
+	info := cachestore.Info{}
+	if out.ContentLength != nil {
+		info.Size = *out.ContentLength
+	}
+
+	if out.LastModified != nil {
+		info.Committed = *out.LastModified
+	}
+
+	d.log.LogAttrs(ctx, slog.LevelDebug, "cachestore_stat_result",
+		csChunkAttrs(k),
+		slog.Bool("present", true),
+		slog.Int64("size", info.Size),
+	)
+
+	return info, nil
+}
+
+// Delete removes the chunk; idempotent.
+func (d *Driver) Delete(ctx context.Context, k chunk.Key) error {
+	d.log.LogAttrs(ctx, slog.LevelDebug, "cachestore_delete",
+		csChunkAttrs(k),
+	)
+
+	_, err := d.client.DeleteObject(ctx, &s3.DeleteObjectInput{
+		Bucket: aws.String(d.bucket),
+		Key:    aws.String(k.Path()),
+	})
+	if err != nil {
+		if isNotFound(err) {
+			return nil
+		}
+
+		return mapErr(err)
+	}
+
+	return nil
+}
+
+// csChunkAttrs renders the chunk's identifying tuple as a slog
+// group attribute matching the cross-package 'chunk' taxonomy used
+// by fetch.Coordinator and chunkcatalog. Operator queries can grep
+// on a single attribute path across the request lifecycle.
+func csChunkAttrs(k chunk.Key) slog.Attr {
+	return slog.Group("chunk",
+		slog.String("origin_id", k.OriginID),
+		slog.String("bucket", k.Bucket),
+		slog.String("key", k.ObjectKey),
+		slog.Int64("index", k.Index),
+	)
+}
+
+func randHex(n int) (string, error) {
+	b := make([]byte, n)
+	if _, err := rand.Read(b); err != nil {
+		// crypto/rand failure is extraordinary on Linux. Surface it
+		// to the selftest caller rather than masking with a
+		// time-based fallback: a fallback could collide on parallel
+		// boots and silently fail the first-put precondition, and
+		// the underlying entropy / sandbox issue is operator-
+		// actionable in its own right.
+		return "", fmt.Errorf("cachestore/s3: rand.Read: %w", err)
+	}
+
+	return hex.EncodeToString(b), nil
+}
+
+// isPreconditionFailed reports whether err represents a 412
+// Precondition Failed response from S3. The atomic-commit primitive
+// (PutObject + If-None-Match: *) returns 412 when the key already
+// exists; the SelfTest path also expects 412 on the duplicate put.
+// We use the HTTP status code carried on *awshttp.ResponseError
+// rather than matching service error codes by string, since the
+// code surface is version-dependent across SDK and backend
+// implementations whereas the HTTP status code is part of the
+// stable wire contract.
+func isPreconditionFailed(err error) bool {
+	var respErr *awshttp.ResponseError
+	if errors.As(err, &respErr) && respErr.Response != nil {
+		return respErr.Response.StatusCode == http.StatusPreconditionFailed
+	}
+
+	return false
+}
+
+func isNotFound(err error) bool {
+	var nsk *s3types.NoSuchKey
+	if errors.As(err, &nsk) {
+		return true
+	}
+
+	var nsb *s3types.NoSuchBucket
+	if errors.As(err, &nsb) {
+		return true
+	}
+
+	var notFound *s3types.NotFound
+	if errors.As(err, &notFound) {
+		return true
+	}
+
+	var respErr *awshttp.ResponseError
+	if errors.As(err, &respErr) && respErr.Response != nil &&
+		respErr.Response.StatusCode == http.StatusNotFound {
+		return true
+	}
+
+	return false
+}
+
+// mapErr normalises driver errors to the cachestore sentinel
+// taxonomy. AccessDenied / Forbidden / Unauthorized are surfaced by
+// the SDK with stable smithy.APIError codes so we keep that match
+// path; everything else routes through HTTP status code on the
+// underlying *awshttp.ResponseError.
+func mapErr(err error) error {
+	if isNotFound(err) {
+		return cachestore.ErrNotFound
+	}
+
+	var apiErr smithy.APIError
+	if errors.As(err, &apiErr) {
+		switch apiErr.ErrorCode() {
+		case "AccessDenied", "Unauthorized", "Forbidden", "InvalidAccessKeyId", "SignatureDoesNotMatch":
+			return cachestore.ErrAuth
+		}
+	}
+
+	var respErr *awshttp.ResponseError
+	if errors.As(err, &respErr) && respErr.Response != nil {
+		status := respErr.Response.StatusCode
+		if status == http.StatusUnauthorized || status == http.StatusForbidden {
+			return cachestore.ErrAuth
+		}
+
+		if status >= 500 && status < 600 {
+			return cachestore.ErrTransient
+		}
+	}
+
+	return err
+}
diff --git a/internal/orca/cachestore/s3/s3_test.go b/internal/orca/cachestore/s3/s3_test.go
new file mode 100644
index 00000000..95466acf
--- /dev/null
+++ b/internal/orca/cachestore/s3/s3_test.go
@@ -0,0 +1,235 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package s3
+
+import (
+	"bytes"
+	"context"
+	"errors"
+	"net/http"
+	"testing"
+
+	awshttp "github.com/aws/aws-sdk-go-v2/aws/transport/http"
+	s3types "github.com/aws/aws-sdk-go-v2/service/s3/types"
+	smithy "github.com/aws/smithy-go"
+	smithyhttp "github.com/aws/smithy-go/transport/http"
+
+	"github.com/Azure/unbounded/internal/orca/cachestore"
+	"github.com/Azure/unbounded/internal/orca/chunk"
+)
+
+// makeResponseErr builds an *awshttp.ResponseError wrapping the
+// given HTTP status code. Mirrors how the AWS SDK surfaces service
+// errors to callers: an *awshttp.ResponseError nesting a
+// *smithyhttp.ResponseError that carries the HTTP response.
+func makeResponseErr(status int, inner error) *awshttp.ResponseError {
+	return &awshttp.ResponseError{
+		ResponseError: &smithyhttp.ResponseError{
+			Response: &smithyhttp.Response{
+				Response: &http.Response{StatusCode: status},
+			},
+			Err: inner,
+		},
+	}
+}
+
+// TestIsPreconditionFailed_FromHTTPStatus verifies that 412 alone
+// signals precondition failure; other statuses (and errors lacking
+// HTTP-response context) do not. The original implementation matched
+// service error codes by string ("PreconditionFailed",
+// "InvalidArgument", "ConditionalRequestConflict") plus substring
+// "412" - fragile across SDK versions and backend implementations.
+func TestIsPreconditionFailed_FromHTTPStatus(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name string
+		err  error
+		want bool
+	}{
+		{"412 ResponseError -> true", makeResponseErr(412, errors.New("precondition")), true},
+		{"500 ResponseError -> false", makeResponseErr(500, errors.New("ise")), false},
+		{"404 ResponseError -> false", makeResponseErr(404, errors.New("not found")), false},
+		{"plain error -> false", errors.New("StatusCode: 412 something"), false},
+		{"nil -> false", nil, false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := isPreconditionFailed(tt.err); got != tt.want {
+				t.Errorf("isPreconditionFailed = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
+
+// TestIsNotFound covers the typed-error and HTTP-status branches.
+func TestIsNotFound(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name string
+		err  error
+		want bool
+	}{
+		{"NoSuchKey typed", &s3types.NoSuchKey{}, true},
+		{"NoSuchBucket typed", &s3types.NoSuchBucket{}, true},
+		{"NotFound typed", &s3types.NotFound{}, true},
+		{"404 ResponseError", makeResponseErr(404, errors.New("not found")), true},
+		{"500 ResponseError", makeResponseErr(500, errors.New("ise")), false},
+		{"plain error", errors.New("random"), false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := isNotFound(tt.err); got != tt.want {
+				t.Errorf("isNotFound = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
+
+// fakeAPIError implements smithy.APIError for testing the
+// AccessDenied / Forbidden mapping path.
+type fakeAPIError struct{ code string }
+
+func (e *fakeAPIError) Error() string                 { return e.code }
+func (e *fakeAPIError) ErrorCode() string             { return e.code }
+func (e *fakeAPIError) ErrorMessage() string          { return e.code }
+func (e *fakeAPIError) ErrorFault() smithy.ErrorFault { return smithy.FaultUnknown }
+func (e *fakeAPIError) HTTPStatusCode() int           { return 0 }
+
+// TestMapErr covers the full mapping table: 404 / typed not-found
+// -> ErrNotFound, AccessDenied APIError -> ErrAuth, 5xx ->
+// ErrTransient, anything else passes through.
+func TestMapErr(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name string
+		err  error
+		want error
+	}{
+		{"NoSuchKey -> ErrNotFound", &s3types.NoSuchKey{}, cachestore.ErrNotFound},
+		{"404 ResponseError -> ErrNotFound", makeResponseErr(404, errors.New("nf")), cachestore.ErrNotFound},
+		{"AccessDenied APIError -> ErrAuth", &fakeAPIError{code: "AccessDenied"}, cachestore.ErrAuth},
+		{"InvalidAccessKeyId APIError -> ErrAuth", &fakeAPIError{code: "InvalidAccessKeyId"}, cachestore.ErrAuth},
+		{"403 ResponseError -> ErrAuth", makeResponseErr(403, errors.New("denied")), cachestore.ErrAuth},
+		{"401 ResponseError -> ErrAuth", makeResponseErr(401, errors.New("unauth")), cachestore.ErrAuth},
+		{"500 ResponseError -> ErrTransient", makeResponseErr(500, errors.New("ise")), cachestore.ErrTransient},
+		{"503 ResponseError -> ErrTransient", makeResponseErr(503, errors.New("unavail")), cachestore.ErrTransient},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := mapErr(tt.err)
+			if !errors.Is(got, tt.want) {
+				t.Errorf("mapErr = %v, want errors.Is(_, %v) true", got, tt.want)
+			}
+		})
+	}
+}
+
+// TestMapErr_PassthroughUnknown verifies that unrecognized errors
+// pass through unchanged.
+func TestMapErr_PassthroughUnknown(t *testing.T) {
+	t.Parallel()
+
+	src := errors.New("unrecognized")
+	if got := mapErr(src); got != src {
+		t.Errorf("mapErr(unknown) = %v, want passthrough %v", got, src)
+	}
+}
+
+// TestGetChunk_RejectsZeroN verifies that GetChunk refuses n <= 0.
+// Forwarding such a request would produce a malformed S3 Range
+// header (bytes=0--1) which the backend rejects with InvalidArgument.
+// The wire-format boundary (cluster.DecodeChunkKey) already rejects
+// object_size <= 0, so an in-process caller reaching this with n <= 0
+// is a logic bug we want surfaced as an explicit error.
+//
+// Regression for C-2.
+func TestGetChunk_RejectsZeroN(t *testing.T) {
+	t.Parallel()
+
+	d := &Driver{}
+
+	tests := []struct {
+		name string
+		off  int64
+		n    int64
+	}{
+		{"n zero", 0, 0},
+		{"n negative", 0, -1},
+		{"off negative", -1, 1024},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			_, err := d.GetChunk(context.Background(), chunkPathOnlyKey(), tt.off, tt.n)
+			if err == nil {
+				t.Errorf("GetChunk(off=%d, n=%d) returned nil; want error", tt.off, tt.n)
+			}
+		})
+	}
+}
+
+// TestPutChunk_RejectsZeroSize verifies that PutChunk refuses
+// size <= 0. A zero-byte commit would poison the path with a
+// 0-byte blob and subsequent GetChunk(n=expected) reads would
+// either error or stream zero bytes.
+//
+// Regression for C-3.
+func TestPutChunk_RejectsZeroSize(t *testing.T) {
+	t.Parallel()
+
+	d := &Driver{}
+
+	for _, size := range []int64{0, -1} {
+		if err := d.PutChunk(context.Background(), chunkPathOnlyKey(), size, nil); err == nil {
+			t.Errorf("PutChunk(size=%d) returned nil; want error", size)
+		}
+	}
+}
+
+// chunkPathOnlyKey returns a minimal chunk.Key whose Path() can be
+// computed; used by the GetChunk / PutChunk guard tests that error
+// before any S3 round-trip.
+func chunkPathOnlyKey() chunk.Key {
+	return chunk.Key{
+		OriginID:  "ox",
+		Bucket:    "b",
+		ObjectKey: "o",
+		ETag:      "e1",
+		ChunkSize: 1024,
+		Index:     0,
+	}
+}
+
+// TestPutChunk_SeekableSizeMismatch verifies that PutChunk rejects
+// a seekable reader whose actual length does not match the declared
+// size. Without the seekable-path probe, a buggy caller passing a
+// Reader of length M with size=N would either be rejected by S3
+// (ContentLength mismatch) or upload a wrong-sized blob.
+//
+// Regression for H-6.
+func TestPutChunk_SeekableSizeMismatch(t *testing.T) {
+	t.Parallel()
+
+	d := &Driver{}
+
+	// Reader has 10 bytes, but caller claims 1024. PutChunk must
+	// fail at the seek-and-check probe before any RPC.
+	r := bytes.NewReader(make([]byte, 10))
+	if err := d.PutChunk(context.Background(), chunkPathOnlyKey(), 1024, r); err == nil {
+		t.Errorf("PutChunk accepted seekable reader with size mismatch")
+	}
+
+	// Reader has 100 bytes, caller claims 50: also a mismatch
+	// (caller would upload only 50, leaving 50 unread).
+	r = bytes.NewReader(make([]byte, 100))
+	if err := d.PutChunk(context.Background(), chunkPathOnlyKey(), 50, r); err == nil {
+		t.Errorf("PutChunk accepted seekable reader longer than declared size")
+	}
+}
diff --git a/internal/orca/chunk/chunk.go b/internal/orca/chunk/chunk.go
new file mode 100644
index 00000000..8a2eb3bd
--- /dev/null
+++ b/internal/orca/chunk/chunk.go
@@ -0,0 +1,213 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Package chunk implements the chunk model: ChunkKey, deterministic
+// path encoding, and the range -> chunk-index iterator.
+package chunk
+
+import (
+	"crypto/sha256"
+	"encoding/binary"
+	"encoding/hex"
+	"fmt"
+	"hash"
+)
+
+// Key is the immutable identifier for a chunk.
+//
+// Path encoding:
+//
+//	LP(s)   = LE64(uint64(len(s))) || s
+//	hashKey = sha256(
+//	            LP(origin_id) ||
+//	            LP(bucket)    ||
+//	            LP(key)       ||
+//	            LP(etag)      ||
+//	            LE64(chunk_size)
+//	          )
+//	path    = "<origin_id>/<hex(hashKey)>/<chunk_index>"
+type Key struct {
+	OriginID  string
+	Bucket    string
+	ObjectKey string
+	ETag      string
+	ChunkSize int64
+	Index     int64
+}
+
+// Path returns the canonical on-store path for this ChunkKey.
+func (k Key) Path() string {
+	h := sha256.New()
+	writeLP(h, k.OriginID)
+	writeLP(h, k.Bucket)
+	writeLP(h, k.ObjectKey)
+	writeLP(h, k.ETag)
+
+	var sizeBuf [8]byte
+	binary.LittleEndian.PutUint64(sizeBuf[:], uint64(k.ChunkSize))
+	h.Write(sizeBuf[:])
+	sum := h.Sum(nil)
+
+	return fmt.Sprintf("%s/%s/%d", k.OriginID, hex.EncodeToString(sum), k.Index)
+}
+
+// Range returns the byte range [Off, Off+Len) within the origin
+// object that this chunk corresponds to.
+func (k Key) Range() (off, length int64) {
+	off = k.Index * k.ChunkSize
+	length = k.ChunkSize
+
+	return off, length
+}
+
+// ExpectedLen returns the authoritative number of bytes this chunk
+// should contain given the object's total size. For non-tail chunks
+// this is k.ChunkSize; for the tail chunk it is the remainder. If
+// objectSize is zero or negative (unknown), returns k.ChunkSize. If
+// the chunk is entirely past the end of the object, returns 0.
+func (k Key) ExpectedLen(objectSize int64) int64 {
+	if objectSize <= 0 {
+		return k.ChunkSize
+	}
+
+	off := k.Index * k.ChunkSize
+	if off >= objectSize {
+		return 0
+	}
+
+	remaining := objectSize - off
+	if remaining < k.ChunkSize {
+		return remaining
+	}
+
+	return k.ChunkSize
+}
+
+// String renders the key compactly for logging.
+func (k Key) String() string {
+	if len(k.ETag) > 8 {
+		return fmt.Sprintf("ChunkKey{%s/%s/%s..@%d#%d}",
+			k.OriginID, k.Bucket, k.ObjectKey, k.Index, len(k.ETag))
+	}
+
+	return fmt.Sprintf("ChunkKey{%s/%s/%s@%d}", k.OriginID, k.Bucket, k.ObjectKey, k.Index)
+}
+
+func writeLP(h hash.Hash, s string) {
+	var lenBuf [8]byte
+	binary.LittleEndian.PutUint64(lenBuf[:], uint64(len(s)))
+	h.Write(lenBuf[:])
+	h.Write([]byte(s))
+}
+
+// IndexRange returns the inclusive [first, last] chunk indices that
+// cover the byte range [start, end] of an object whose total size is
+// objectSize.
+//
+// Inputs:
+//   - start, end: requested byte range (inclusive on both ends).
+//     Both must be >= 0 under normal use.
+//   - chunkSize: > 0; the configured chunk size.
+//   - objectSize: > 0 for any meaningful call. Empty-object callers
+//     should not invoke IndexRange; the server short-circuits to
+//     200 + empty body upstream.
+//
+// Clamping behaviour:
+//   - end >= objectSize is clamped to objectSize - 1.
+//   - end < 0 is defensively clamped to 0 (returns first=0, last=0,
+//     meaning "chunk 0" - the caller must already have prevented
+//     reaching this branch in normal flow; the clamp is a guard
+//     against an arithmetic bug elsewhere, not a supported empty-
+//     range encoding).
+//
+// The function does not validate chunkSize > 0; a zero or negative
+// chunkSize panics with a runtime division-by-zero. The config
+// validation at startup (chunking.size minimum 1 MiB) guarantees
+// this invariant in production.
+func IndexRange(start, end, chunkSize, objectSize int64) (first, last int64) {
+	if end >= objectSize {
+		end = objectSize - 1
+	}
+
+	if end < 0 {
+		end = 0
+	}
+
+	first = start / chunkSize
+	last = end / chunkSize
+
+	return first, last
+}
+
+// Tier is one entry in the chunk-size policy: objects with size
+// >= MinObjectSize use ChunkSize, unless a higher-threshold tier
+// also matches (in which case the higher tier wins).
+//
+// Tiers form an ascending-threshold ladder that overrides a base
+// chunk size for sufficiently large objects, letting operators
+// trade per-chunk HTTP overhead against per-fill memory for big
+// blobs without changing the storage layout. See SizeFor for the
+// selection rule.
+type Tier struct {
+	MinObjectSize int64
+	ChunkSize     int64
+}
+
+// SizeFor returns the chunk size to use for an object of objectSize
+// bytes. tiers must be strictly ascending by MinObjectSize; callers
+// are responsible for validating this at config load time.
+// objectSize <= 0 (unknown) returns base unchanged so that callers
+// without a HEAD-resolved size still get a valid chunk size.
+//
+// Selection rule: walk tiers in ascending threshold order and pick
+// the last tier whose MinObjectSize <= objectSize. If no tier
+// matches (objectSize is smaller than the smallest threshold, or
+// tiers is empty), the base size is returned. Ties on a tier
+// boundary are inclusive of the lower bound: an object of size
+// exactly MinObjectSize uses that tier's ChunkSize.
+func SizeFor(objectSize, base int64, tiers []Tier) int64 {
+	if objectSize <= 0 {
+		return base
+	}
+
+	chosen := base
+
+	for _, t := range tiers {
+		if t.MinObjectSize > objectSize {
+			// Tiers are sorted ascending; no later tier can match.
+			break
+		}
+
+		chosen = t.ChunkSize
+	}
+
+	return chosen
+}
+
+// ChunkSlice returns the [off, len) within a single chunk that
+// satisfies the original client byte range [start, end].
+//
+// chunkIdx is the chunk index. chunkSize is the configured chunk size.
+// objectSize is the total origin-object size (used to clamp the last
+// chunk if it is partial).
+func ChunkSlice(chunkIdx, chunkSize, start, end, objectSize int64) (off, length int64) {
+	chunkStart := chunkIdx * chunkSize
+
+	chunkEnd := chunkStart + chunkSize - 1
+	if chunkEnd >= objectSize {
+		chunkEnd = objectSize - 1
+	}
+
+	if start > chunkStart {
+		off = start - chunkStart
+	}
+
+	sliceEnd := chunkEnd
+	if end < chunkEnd {
+		sliceEnd = end
+	}
+
+	length = sliceEnd - chunkStart - off + 1
+
+	return off, length
+}
diff --git a/internal/orca/chunk/chunk_test.go b/internal/orca/chunk/chunk_test.go
new file mode 100644
index 00000000..cfed7dcb
--- /dev/null
+++ b/internal/orca/chunk/chunk_test.go
@@ -0,0 +1,379 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package chunk
+
+import (
+	"strings"
+	"testing"
+)
+
+// TestKey_ExpectedLen covers the per-chunk expected length given an
+// object size: full chunks for non-tail, remainder for the tail, 0 for
+// past-end, k.ChunkSize when objectSize is unknown (<= 0).
+func TestKey_ExpectedLen(t *testing.T) {
+	t.Parallel()
+
+	const cs = int64(1024)
+
+	tests := []struct {
+		name       string
+		k          Key
+		objectSize int64
+		want       int64
+	}{
+		{"full chunk 0", Key{ChunkSize: cs, Index: 0}, 4096, cs},
+		{"full chunk 2", Key{ChunkSize: cs, Index: 2}, 4096, cs},
+		{"tail chunk partial", Key{ChunkSize: cs, Index: 3}, 3500, 3500 - 3072},
+		{"chunk exactly fills object", Key{ChunkSize: cs, Index: 3}, 4096, cs},
+		{"chunk past end returns 0", Key{ChunkSize: cs, Index: 5}, 3500, 0},
+		{"objectSize 0 -> ChunkSize (unknown)", Key{ChunkSize: cs, Index: 0}, 0, cs},
+		{"objectSize negative -> ChunkSize", Key{ChunkSize: cs, Index: 7}, -1, cs},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got := tc.k.ExpectedLen(tc.objectSize)
+			if got != tc.want {
+				t.Errorf("ExpectedLen=%d want %d", got, tc.want)
+			}
+		})
+	}
+}
+
+// TestKey_Path_Deterministic verifies that the same inputs always
+// produce the same path and that meaningful input differences
+// (OriginID, Bucket, ObjectKey, ETag, ChunkSize, Index) produce
+// distinct paths. The path encoding is part of orca's design
+// contract: any change here invalidates previously cached chunks.
+func TestKey_Path_Deterministic(t *testing.T) {
+	t.Parallel()
+
+	base := Key{
+		OriginID:  "origin-a",
+		Bucket:    "bucket",
+		ObjectKey: "key",
+		ETag:      "etag1",
+		ChunkSize: 1024,
+		Index:     0,
+	}
+	// Same inputs -> same path. Compare two equally-constructed Keys
+	// (calling Path() on the same receiver tautologically passes).
+	dup := base
+	if base.Path() != dup.Path() {
+		t.Fatalf("Path() not deterministic for identical key")
+	}
+
+	other := base
+	otherPath := other.Path()
+
+	mutations := []struct {
+		name string
+		mut  func(k *Key)
+	}{
+		{"different origin", func(k *Key) { k.OriginID = "origin-b" }},
+		{"different bucket", func(k *Key) { k.Bucket = "other-bucket" }},
+		{"different key", func(k *Key) { k.ObjectKey = "other-key" }},
+		{"different etag", func(k *Key) { k.ETag = "etag2" }},
+		{"different chunk size", func(k *Key) { k.ChunkSize = 2048 }},
+		{"different index", func(k *Key) { k.Index = 1 }},
+	}
+
+	for _, m := range mutations {
+		t.Run(m.name, func(t *testing.T) {
+			mutated := base
+			m.mut(&mutated)
+
+			got := mutated.Path()
+			if got == otherPath {
+				t.Errorf("path collision after %s mutation: %q", m.name, got)
+			}
+		})
+	}
+}
+
+// TestKey_Path_Format asserts the documented path shape:
+// "<origin_id>/<hex(sha256)>/<chunk_index>".
+func TestKey_Path_Format(t *testing.T) {
+	t.Parallel()
+
+	k := Key{
+		OriginID:  "origin-a",
+		Bucket:    "b",
+		ObjectKey: "k",
+		ETag:      "e",
+		ChunkSize: 1024,
+		Index:     7,
+	}
+
+	path := k.Path()
+
+	parts := strings.Split(path, "/")
+	if len(parts) != 3 {
+		t.Fatalf("path %q has %d segments, want 3", path, len(parts))
+	}
+
+	if parts[0] != "origin-a" {
+		t.Errorf("origin segment=%q want %q", parts[0], "origin-a")
+	}
+
+	if len(parts[1]) != 64 {
+		t.Errorf("hex segment len=%d want 64 (sha256)", len(parts[1]))
+	}
+
+	for _, c := range parts[1] {
+		isDigit := c >= '0' && c <= '9'
+		isLowerHex := c >= 'a' && c <= 'f'
+
+		if !isDigit && !isLowerHex {
+			t.Errorf("hex segment contains non-hex char %q", c)
+			break
+		}
+	}
+
+	if parts[2] != "7" {
+		t.Errorf("index segment=%q want %q", parts[2], "7")
+	}
+}
+
+// TestKey_Range verifies (off, length) = (Index*ChunkSize, ChunkSize).
+func TestKey_Range(t *testing.T) {
+	t.Parallel()
+
+	k := Key{ChunkSize: 1 << 20, Index: 3}
+
+	off, length := k.Range()
+	if off != 3<<20 {
+		t.Errorf("off=%d want %d", off, 3<<20)
+	}
+
+	if length != 1<<20 {
+		t.Errorf("length=%d want %d", length, 1<<20)
+	}
+}
+
+// TestIndexRange covers the chunk-index span computed from a byte
+// range plus the end clamping to objectSize.
+func TestIndexRange(t *testing.T) {
+	t.Parallel()
+
+	const chunkSize = int64(1024)
+
+	tests := []struct {
+		name       string
+		start, end int64
+		objectSize int64
+		wantFirst  int64
+		wantLast   int64
+	}{
+		{"aligned full chunk", 0, 1023, 1024, 0, 0},
+		{"aligned two chunks", 0, 2047, 4096, 0, 1},
+		{"start mid-chunk, end mid-chunk same", 100, 500, 1024, 0, 0},
+		{"start mid-chunk, end mid-next-chunk", 100, 1500, 4096, 0, 1},
+		{"end clamped to objectSize", 0, 9999, 2048, 0, 1},
+		{"single byte", 5, 5, 1024, 0, 0},
+		{"last partial chunk", 1024, 1500, 1500, 1, 1},
+		// Empty-object guard: end = -1 (objectSize == 0). Without
+		// the negative-end clamp Go's integer division floors to 0
+		// but a subsequent negative-end could leak through other
+		// branches; defensive clamp here keeps last >= 0.
+		{"empty object end=-1 clamped to 0", 0, -1, 0, 0, 0},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			first, last := IndexRange(tt.start, tt.end, chunkSize, tt.objectSize)
+			if first != tt.wantFirst {
+				t.Errorf("first=%d want %d", first, tt.wantFirst)
+			}
+
+			if last != tt.wantLast {
+				t.Errorf("last=%d want %d", last, tt.wantLast)
+			}
+		})
+	}
+}
+
+// TestChunkSlice covers the (off, length) within a single chunk that
+// satisfies the original byte range. Critical for cross-chunk
+// streamSlice copies.
+func TestChunkSlice(t *testing.T) {
+	t.Parallel()
+
+	const chunkSize = int64(1024)
+
+	tests := []struct {
+		name       string
+		chunkIdx   int64
+		start      int64
+		end        int64
+		objectSize int64
+		wantOff    int64
+		wantLen    int64
+	}{
+		{"entirely within chunk 0", 0, 100, 199, 4096, 100, 100},
+		{"start at chunk 0 boundary", 0, 0, 99, 4096, 0, 100},
+		{"end at chunk 0 boundary", 0, 0, 1023, 4096, 0, 1024},
+		{"chunk 1, range covers full chunk", 1, 1024, 2047, 4096, 0, 1024},
+		{"chunk spans range start", 1, 500, 1500, 4096, 0, 477}, // [1024..1500]
+		{"chunk spans range end", 1, 1500, 2500, 4096, 476, 548},
+		{"last partial chunk", 3, 3000, 3500, 3500, 0, 428},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			off, length := ChunkSlice(tt.chunkIdx, chunkSize, tt.start, tt.end, tt.objectSize)
+			if off != tt.wantOff {
+				t.Errorf("off=%d want %d", off, tt.wantOff)
+			}
+
+			if length != tt.wantLen {
+				t.Errorf("length=%d want %d", length, tt.wantLen)
+			}
+		})
+	}
+}
+
+// TestSizeFor covers the chunk-size tier ladder: base for objects
+// below the first threshold (or unknown sizes), tier ChunkSize for
+// objects at or above the corresponding MinObjectSize, and
+// last-tier-wins resolution when multiple tiers match.
+func TestSizeFor(t *testing.T) {
+	t.Parallel()
+
+	const (
+		base = int64(8 * 1024 * 1024)         // 8 MiB
+		t1   = int64(64 * 1024 * 1024)        // 64 MiB
+		t2   = int64(128 * 1024 * 1024)       // 128 MiB
+		oneG = int64(1024 * 1024 * 1024)      // 1 GiB
+		tenG = int64(10 * 1024 * 1024 * 1024) // 10 GiB
+	)
+
+	defaultTiers := []Tier{
+		{MinObjectSize: oneG, ChunkSize: t1},
+		{MinObjectSize: tenG, ChunkSize: t2},
+	}
+
+	tests := []struct {
+		name       string
+		objectSize int64
+		base       int64
+		tiers      []Tier
+		want       int64
+	}{
+		{
+			name:       "empty tiers returns base",
+			objectSize: 100 << 20,
+			base:       base,
+			tiers:      nil,
+			want:       base,
+		},
+		{
+			name:       "object below first threshold returns base",
+			objectSize: 512 << 20,
+			base:       base,
+			tiers:      defaultTiers,
+			want:       base,
+		},
+		{
+			name:       "object exactly at first threshold uses first tier",
+			objectSize: oneG,
+			base:       base,
+			tiers:      defaultTiers,
+			want:       t1,
+		},
+		{
+			name:       "object between tiers uses lower tier",
+			objectSize: oneG + (1 << 20),
+			base:       base,
+			tiers:      defaultTiers,
+			want:       t1,
+		},
+		{
+			name:       "object exactly at second threshold uses second tier",
+			objectSize: tenG,
+			base:       base,
+			tiers:      defaultTiers,
+			want:       t2,
+		},
+		{
+			name:       "huge object uses highest tier",
+			objectSize: 700 * 1024 * 1024 * 1024,
+			base:       base,
+			tiers:      defaultTiers,
+			want:       t2,
+		},
+		{
+			name:       "zero objectSize (unknown) returns base",
+			objectSize: 0,
+			base:       base,
+			tiers:      defaultTiers,
+			want:       base,
+		},
+		{
+			name:       "negative objectSize returns base",
+			objectSize: -1,
+			base:       base,
+			tiers:      defaultTiers,
+			want:       base,
+		},
+		{
+			name:       "single tier above object",
+			objectSize: 500 << 20,
+			base:       base,
+			tiers:      []Tier{{MinObjectSize: oneG, ChunkSize: t1}},
+			want:       base,
+		},
+		{
+			name:       "single tier at object",
+			objectSize: oneG,
+			base:       base,
+			tiers:      []Tier{{MinObjectSize: oneG, ChunkSize: t1}},
+			want:       t1,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := SizeFor(tt.objectSize, tt.base, tt.tiers)
+			if got != tt.want {
+				t.Errorf("SizeFor(%d, %d, %v)=%d want %d",
+					tt.objectSize, tt.base, tt.tiers, got, tt.want)
+			}
+		})
+	}
+}
+
+// TestKey_String covers both formatting branches (short ETag + long
+// ETag).
+func TestKey_String(t *testing.T) {
+	t.Parallel()
+
+	short := Key{
+		OriginID:  "o",
+		Bucket:    "b",
+		ObjectKey: "k",
+		ETag:      "abc",
+		Index:     5,
+	}
+	if s := short.String(); !strings.Contains(s, "@5") {
+		t.Errorf("short ETag string=%q does not contain @5", s)
+	}
+
+	long := Key{
+		OriginID:  "o",
+		Bucket:    "b",
+		ObjectKey: "k",
+		ETag:      "abcdefghi", // 9 chars > 8
+		Index:     5,
+	}
+
+	s := long.String()
+	if !strings.Contains(s, "..@") {
+		t.Errorf("long ETag string=%q does not contain truncation marker '..@'", s)
+	}
+
+	if !strings.Contains(s, "#9") {
+		t.Errorf("long ETag string=%q does not contain length suffix '#9'", s)
+	}
+}
diff --git a/internal/orca/chunkcatalog/chunkcatalog.go b/internal/orca/chunkcatalog/chunkcatalog.go
new file mode 100644
index 00000000..8b80c0bd
--- /dev/null
+++ b/internal/orca/chunkcatalog/chunkcatalog.go
@@ -0,0 +1,168 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Package chunkcatalog implements a bounded LRU recording chunks known
+// to be present in the CacheStore. Pure hot-path optimization;
+// CacheStore is the source of truth.
+//
+// The catalog is presence-only: it tracks whether a chunk's path is
+// known to exist in the cachestore. No size or metadata is stored.
+// chunk.Path encodes (origin_id, bucket, key, etag, chunk_size), so
+// a path hit means the cachestore contains bytes for this exact
+// version of this chunk - the path encoding IS the integrity
+// statement, and a stale entry whose backing bytes have been deleted
+// is self-healing (cachestore.GetChunk returns ErrNotFound, caller
+// Forget()s the entry and falls through to the stat path).
+package chunkcatalog
+
+import (
+	"container/list"
+	"context"
+	"log/slog"
+	"sync"
+
+	"github.com/Azure/unbounded/internal/orca/chunk"
+)
+
+// Catalog is a bounded LRU keyed on chunk.Key.Path().
+type Catalog struct {
+	mu         sync.Mutex
+	maxEntries int
+	ll         *list.List
+	idx        map[string]*list.Element
+	log        *slog.Logger
+}
+
+type entry struct {
+	path string
+}
+
+// New constructs a Catalog. The log is used at debug level for
+// per-call hit / miss / record / forget / evict trace lines via
+// slog.LogAttrs so the cost when filtered out (operator runs at
+// info or higher) is just the handler's level check. Passing nil
+// falls back to slog.Default().
+func New(maxEntries int, log *slog.Logger) *Catalog {
+	if maxEntries <= 0 {
+		maxEntries = 100_000
+	}
+
+	if log == nil {
+		log = slog.Default()
+	}
+
+	return &Catalog{
+		maxEntries: maxEntries,
+		ll:         list.New(),
+		idx:        make(map[string]*list.Element, maxEntries),
+		log:        log,
+	}
+}
+
+// Lookup reports whether the chunk is known to be present in the
+// cachestore. Bumps the LRU position on hit.
+//
+// This is the hottest log site in orca: it fires on every chunk read
+// attempt. The LogAttrs path ensures attribute-evaluation cost is
+// zero when the configured level is above Debug.
+func (c *Catalog) Lookup(k chunk.Key) bool {
+	path := k.Path()
+
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	el, ok := c.idx[path]
+	if !ok {
+		c.log.LogAttrs(context.Background(), slog.LevelDebug, "chunkcatalog_lookup_miss",
+			catalogAttrs(k),
+		)
+
+		return false
+	}
+
+	c.ll.MoveToFront(el)
+
+	c.log.LogAttrs(context.Background(), slog.LevelDebug, "chunkcatalog_lookup_hit",
+		catalogAttrs(k),
+	)
+
+	return true
+}
+
+// Record marks the chunk as present.
+//
+// The catalog is presence-only: callers do not pass (and the catalog
+// does not store) any size or freshness metadata. chunk.Path encodes
+// (origin_id, bucket, key, etag, chunk_size), so a Recorded key is
+// sufficient to know which exact version is in the cachestore. See
+// the package docstring for the rationale.
+func (c *Catalog) Record(k chunk.Key) {
+	path := k.Path()
+
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	if el, ok := c.idx[path]; ok {
+		c.ll.MoveToFront(el)
+
+		c.log.LogAttrs(context.Background(), slog.LevelDebug, "chunkcatalog_record_update",
+			catalogAttrs(k),
+		)
+
+		return
+	}
+
+	el := c.ll.PushFront(&entry{path: path})
+
+	c.idx[path] = el
+
+	c.log.LogAttrs(context.Background(), slog.LevelDebug, "chunkcatalog_record_insert",
+		catalogAttrs(k),
+	)
+
+	for c.ll.Len() > c.maxEntries {
+		oldest := c.ll.Back()
+		if oldest == nil {
+			break
+		}
+
+		c.ll.Remove(oldest)
+
+		oldEntry := oldest.Value.(*entry) //nolint:errcheck // type invariant: list elements are *entry
+		delete(c.idx, oldEntry.path)
+
+		c.log.LogAttrs(context.Background(), slog.LevelDebug, "chunkcatalog_evict",
+			slog.String("evicted_path", oldEntry.path),
+			slog.Int("lru_len", c.ll.Len()),
+		)
+	}
+}
+
+// Forget removes the entry if present.
+func (c *Catalog) Forget(k chunk.Key) {
+	path := k.Path()
+
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	if el, ok := c.idx[path]; ok {
+		c.ll.Remove(el)
+		delete(c.idx, path)
+		c.log.LogAttrs(context.Background(), slog.LevelDebug, "chunkcatalog_forget",
+			catalogAttrs(k),
+		)
+	}
+}
+
+// catalogAttrs renders the chunk's identifying tuple as a slog
+// group attribute, matching the 'chunk' taxonomy used by
+// fetch.Coordinator emissions so operator queries can grep on a
+// single consistent attribute path across packages.
+func catalogAttrs(k chunk.Key) slog.Attr {
+	return slog.Group("chunk",
+		slog.String("origin_id", k.OriginID),
+		slog.String("bucket", k.Bucket),
+		slog.String("key", k.ObjectKey),
+		slog.Int64("index", k.Index),
+	)
+}
diff --git a/internal/orca/chunkcatalog/chunkcatalog_test.go b/internal/orca/chunkcatalog/chunkcatalog_test.go
new file mode 100644
index 00000000..ea66893b
--- /dev/null
+++ b/internal/orca/chunkcatalog/chunkcatalog_test.go
@@ -0,0 +1,141 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package chunkcatalog
+
+import (
+	"bytes"
+	"io"
+	"log/slog"
+	"strings"
+	"testing"
+
+	"github.com/Azure/unbounded/internal/orca/chunk"
+)
+
+// TestNew_UsesInjectedLogger locks the contract that the catalog
+// stores the caller's logger rather than slog.Default.
+func TestNew_UsesInjectedLogger(t *testing.T) {
+	t.Parallel()
+
+	injected := slog.New(slog.NewTextHandler(io.Discard, nil))
+	c := New(16, injected)
+
+	if c.log != injected {
+		t.Errorf("Catalog.log not the injected logger")
+	}
+}
+
+// TestNew_NilLoggerFallsBackToDefault verifies the nil-logger
+// fallback so misconfigured callers do not panic on the first
+// trace emission.
+func TestNew_NilLoggerFallsBackToDefault(t *testing.T) {
+	t.Parallel()
+
+	c := New(16, nil)
+	if c.log == nil {
+		t.Errorf("nil logger should have fallen back to slog.Default()")
+	}
+}
+
+// TestRecord_Lookup_Forget exercises the basic LRU operations
+// against the presence-only API.
+func TestRecord_Lookup_Forget(t *testing.T) {
+	t.Parallel()
+
+	c := New(16, nil)
+
+	k := chunk.Key{OriginID: "o", Bucket: "b", ObjectKey: "key", ChunkSize: 1024}
+	if c.Lookup(k) {
+		t.Fatalf("lookup before record returned hit")
+	}
+
+	c.Record(k)
+
+	if !c.Lookup(k) {
+		t.Errorf("lookup after record returned miss")
+	}
+
+	c.Forget(k)
+
+	if c.Lookup(k) {
+		t.Errorf("lookup after forget returned hit")
+	}
+}
+
+// TestDebugEmissions verifies the catalog emits the standardized
+// 'chunk' attribute group at debug level on the four operation
+// classes (lookup hit, lookup miss, record insert, forget) and that
+// the messages route through the injected logger.
+func TestDebugEmissions(t *testing.T) {
+	t.Parallel()
+
+	var buf bytes.Buffer
+
+	log := slog.New(slog.NewTextHandler(&buf, &slog.HandlerOptions{Level: slog.LevelDebug}))
+	c := New(16, log)
+
+	k := chunk.Key{OriginID: "ox", Bucket: "bkt", ObjectKey: "obj", ChunkSize: 1024, Index: 4}
+
+	c.Lookup(k) // miss
+	c.Record(k)
+	c.Lookup(k) // hit
+	c.Forget(k)
+
+	out := buf.String()
+	for _, want := range []string{
+		"chunkcatalog_lookup_miss",
+		"chunkcatalog_record_insert",
+		"chunkcatalog_lookup_hit",
+		"chunkcatalog_forget",
+		"chunk.index=4",
+		"chunk.key=obj",
+	} {
+		if !strings.Contains(out, want) {
+			t.Errorf("expected %q in debug output; got %q", want, out)
+		}
+	}
+}
+
+// TestDebugFilteredAtInfo verifies the catalog emits nothing when
+// the handler is configured above Debug, so the hot-path overhead
+// at production levels is just the handler's level check.
+func TestDebugFilteredAtInfo(t *testing.T) {
+	t.Parallel()
+
+	var buf bytes.Buffer
+
+	log := slog.New(slog.NewTextHandler(&buf, &slog.HandlerOptions{Level: slog.LevelInfo}))
+	c := New(16, log)
+
+	k := chunk.Key{OriginID: "ox", Bucket: "b", ObjectKey: "o", ChunkSize: 1024}
+	c.Record(k)
+	c.Lookup(k)
+	c.Forget(k)
+
+	if buf.Len() != 0 {
+		t.Errorf("debug emission leaked through Info-level handler: %q", buf.String())
+	}
+}
+
+// TestEvictEmitsAttr ensures the LRU-eviction debug emission fires
+// when capacity is exceeded. Capacity 1 plus two distinct inserts
+// forces an eviction observable via the evicted_path attribute.
+func TestEvictEmitsAttr(t *testing.T) {
+	t.Parallel()
+
+	var buf bytes.Buffer
+
+	log := slog.New(slog.NewTextHandler(&buf, &slog.HandlerOptions{Level: slog.LevelDebug}))
+	c := New(1, log)
+
+	k1 := chunk.Key{OriginID: "o", Bucket: "b", ObjectKey: "a", ChunkSize: 1024}
+	k2 := chunk.Key{OriginID: "o", Bucket: "b", ObjectKey: "b", ChunkSize: 1024}
+
+	c.Record(k1)
+	c.Record(k2)
+
+	if !strings.Contains(buf.String(), "chunkcatalog_evict") {
+		t.Errorf("evict emission missing from output: %q", buf.String())
+	}
+}
diff --git a/internal/orca/cluster/cluster.go b/internal/orca/cluster/cluster.go
new file mode 100644
index 00000000..a4f240c0
--- /dev/null
+++ b/internal/orca/cluster/cluster.go
@@ -0,0 +1,718 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Package cluster handles peer discovery and rendezvous-hash
+// coordinator selection.
+//
+// Peer discovery: the headless Kubernetes Service backing the Orca
+// Deployment publishes Pod IPs in its A-record. We poll DNS at
+// cluster.membership_refresh interval (default 5s) and snapshot the
+// peer set.
+//
+// Coordinator selection: rendezvous hashing on (peer_ip, ChunkKey)
+// picks one coordinator per chunk across the cluster.
+//
+// Internal RPC: each replica runs an HTTP/2 client to dial peers'
+// internal listeners (mTLS in production, plain in dev). The
+// listener side is in the server/internal handler.
+//
+// # Test seams
+//
+// Production constructs a DNS-backed PeerSource implicitly from
+// cfg.Cluster.Service + net.DefaultResolver. Tests substitute the
+// entire mechanism with WithPeerSource (typically a mutable
+// StaticPeerSource per replica).
+package cluster
+
+import (
+	"context"
+	"crypto/sha256"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+	"log/slog"
+	"net"
+	"net/http"
+	"net/url"
+	"strconv"
+	"sync/atomic"
+	"time"
+
+	"github.com/Azure/unbounded/internal/orca/chunk"
+	"github.com/Azure/unbounded/internal/orca/config"
+)
+
+// Peer represents one replica in the current peer-set snapshot.
+//
+// In production every Peer has Port == 0 because pod IPs are
+// addressed on the same internal-listener port across the
+// Deployment. Integration tests with multiple replicas sharing
+// 127.0.0.1 set Port to the per-replica OS-assigned port; in that
+// mode FillFromPeer dials peer.IP:peer.Port instead of falling back
+// to cfg.Cluster.InternalListen's port.
+type Peer struct {
+	IP   string
+	Port int  // 0 = use cfg.Cluster.InternalListen's port (production)
+	Self bool // true when this Peer entry represents the local replica
+}
+
+// Cluster manages peer discovery, rendezvous hashing, and the
+// internal-RPC client.
+type Cluster struct {
+	cfg config.Cluster
+	log *slog.Logger
+
+	peers atomic.Pointer[[]Peer]
+
+	httpClient *http.Client
+	source     PeerSource
+
+	// consecutiveRefreshErrors counts adjacent failed refresh attempts.
+	// Reset on any successful refresh. When the count exceeds
+	// maxStalePeerRefreshes the retained-previous fallback gives up
+	// and reverts to a self-only peer set.
+	consecutiveRefreshErrors atomic.Int64
+
+	cancelFn context.CancelFunc
+	done     chan struct{}
+}
+
+// maxStalePeerRefreshes is the number of consecutive refresh failures
+// after which Cluster.refresh stops retaining the previous peer-set
+// snapshot and falls back to [Self]. Bounds how long we route to
+// dead peers if peer discovery is permanently broken.
+const maxStalePeerRefreshes = 5
+
+// resolver looks up the host names that back the headless Service.
+// Production uses net.DefaultResolver. The interface is
+// package-internal: production code does not customize it, and the
+// DNS-backed peer source is the only implementation.
+type resolver interface {
+	LookupHost(ctx context.Context, host string) ([]string, error)
+}
+
+// PeerSource produces the current peer-set snapshot. The DNS-backed
+// implementation queries the headless Service's A-record. Tests
+// substitute a StaticPeerSource that returns a mutable list of peers
+// with explicit Port values (so multiple replicas can share an IP).
+//
+// Each returned Peer.Self must be authoritatively set by the source
+// (the source knows the calling replica's identity at construction
+// time, so it is the only place that can stamp Self correctly when
+// peers share an IP).
+type PeerSource interface {
+	Peers(ctx context.Context) ([]Peer, error)
+}
+
+// Option configures a Cluster at construction time.
+type Option func(*Cluster)
+
+// WithPeerSource replaces the entire peer-discovery mechanism. This
+// is the primary test seam; production code constructs the default
+// DNS-backed source implicitly from cfg.Cluster.Service.
+func WithPeerSource(s PeerSource) Option {
+	return func(c *Cluster) { c.source = s }
+}
+
+// WithHTTPClient overrides the internal-RPC HTTP client. TEST-ONLY:
+// production constructs the default client from cfg via newHTTPClient.
+// Used by unit tests that need to inject a client with custom timeouts
+// or transport behaviour for deterministic deadline coverage.
+func WithHTTPClient(c *http.Client) Option {
+	return func(cl *Cluster) { cl.httpClient = c }
+}
+
+// WithLogger overrides the cluster's structured logger. The default
+// is slog.Default(). The logger receives debug-level emissions for
+// every refresh cycle, coordinator selection, and FillFromPeer call,
+// plus warn-level emissions for retained-previous-snapshot fallback.
+func WithLogger(log *slog.Logger) Option {
+	return func(cl *Cluster) { cl.log = log }
+}
+
+func newDNSPeerSource(service, selfIP string, r resolver) PeerSource {
+	if r == nil {
+		r = net.DefaultResolver
+	}
+
+	return &dnsPeerSource{
+		service:  service,
+		selfIP:   selfIP,
+		resolver: r,
+	}
+}
+
+type dnsPeerSource struct {
+	service  string
+	selfIP   string
+	resolver resolver
+}
+
+func (s *dnsPeerSource) Peers(ctx context.Context) ([]Peer, error) {
+	rctx, cancel := context.WithTimeout(ctx, 3*time.Second)
+	defer cancel()
+
+	ips, err := s.resolver.LookupHost(rctx, s.service)
+	if err != nil {
+		return nil, err
+	}
+
+	peers := make([]Peer, 0, len(ips))
+	for _, ip := range ips {
+		peers = append(peers, Peer{IP: ip, Self: ip == s.selfIP})
+	}
+
+	return peers, nil
+}
+
+// New returns a Cluster and starts the membership-refresh goroutine.
+func New(parent context.Context, cfg config.Cluster, opts ...Option) (*Cluster, error) {
+	if cfg.Service == "" {
+		return nil, fmt.Errorf("cluster: service required (headless Service FQDN)")
+	}
+
+	if cfg.SelfPodIP == "" {
+		return nil, fmt.Errorf("cluster: self_pod_ip required (set POD_IP env)")
+	}
+
+	ctx, cancel := context.WithCancel(parent)
+
+	httpClient, err := newHTTPClient(cfg)
+	if err != nil {
+		cancel()
+		return nil, err
+	}
+
+	c := &Cluster{
+		cfg:        cfg,
+		log:        slog.Default(),
+		httpClient: httpClient,
+		source:     newDNSPeerSource(cfg.Service, cfg.SelfPodIP, nil),
+		cancelFn:   cancel,
+		done:       make(chan struct{}),
+	}
+
+	for _, opt := range opts {
+		opt(c)
+	}
+
+	if c.log == nil {
+		c.log = slog.Default()
+	}
+	// Initial refresh; failure is non-fatal (empty peer-set fallback).
+	c.refresh(ctx)
+
+	go c.refreshLoop(ctx)
+
+	return c, nil
+}
+
+// Close stops the refresh goroutine and waits for it to exit. If ctx
+// is canceled before the goroutine exits (e.g. an in-flight DNS
+// lookup is taking longer than the caller can tolerate) Close returns
+// the context error. The underlying cancellation is always signalled,
+// so the goroutine will exit eventually even if the caller stops
+// waiting.
+func (c *Cluster) Close(ctx context.Context) error {
+	c.cancelFn()
+
+	select {
+	case <-c.done:
+		return nil
+	case <-ctx.Done():
+		return ctx.Err()
+	}
+}
+
+// Peers returns the current peer-set snapshot.
+func (c *Cluster) Peers() []Peer {
+	p := c.peers.Load()
+	if p == nil {
+		return []Peer{{IP: c.cfg.SelfPodIP, Self: true}}
+	}
+
+	return *p
+}
+
+// HasInitialSnapshot reports whether the cluster has loaded at least
+// one peer-set snapshot (success or failure path - any value stored
+// by refresh counts). Used by the app's /readyz endpoint to gate
+// readiness on cluster discovery having completed its initial pass.
+// Returns false only during the bootstrap window before refresh
+// runs even once.
+func (c *Cluster) HasInitialSnapshot() bool {
+	return c.peers.Load() != nil
+}
+
+// Coordinator selects the rendezvous-hashed coordinator for a chunk.
+//
+// Returns the Peer with the highest hash(peer || chunk_path) score.
+// Peers() always returns at least one entry (self, via the bootstrap
+// fallback in Peers and the never-empty post-condition of every
+// branch in refresh), so this function does not need to handle an
+// empty input.
+func (c *Cluster) Coordinator(k chunk.Key) Peer {
+	peers := c.Peers()
+
+	path := []byte(k.Path())
+
+	var (
+		best      Peer
+		bestScore uint64
+	)
+
+	for i, p := range peers {
+		score := rendezvousScore(p, path)
+		if i == 0 || score > bestScore {
+			bestScore = score
+			best = p
+		}
+	}
+
+	c.log.LogAttrs(context.Background(), slog.LevelDebug, "coordinator_selected",
+		slog.String("origin_id", k.OriginID),
+		slog.String("bucket", k.Bucket),
+		slog.String("key", k.ObjectKey),
+		slog.Int64("index", k.Index),
+		slog.String("chosen_ip", best.IP),
+		slog.Bool("is_self", best.Self),
+		slog.Uint64("score", bestScore),
+	)
+
+	return best
+}
+
+// IsCoordinator reports whether this replica is the coordinator for k.
+// Every code path producing a coord value stamps the Self flag
+// authoritatively (dnsPeerSource matches by selfIP; StaticPeerSource
+// by (selfIP, selfPort); the empty-peer-set fallback constructs
+// c.self()), so checking Self is the single source of truth.
+func (c *Cluster) IsCoordinator(k chunk.Key) bool {
+	return c.Coordinator(k).Self
+}
+
+// FillFromPeer issues GET /internal/fill against the named peer and
+// returns the streaming chunk body. Caller closes the returned
+// reader. objectSize is the authoritative size of the object the
+// chunk belongs to; it is forwarded to the peer so the leader can
+// compute the correct per-chunk length (especially for the tail
+// chunk) and set Content-Length on its response.
+func (c *Cluster) FillFromPeer(ctx context.Context, p Peer, k chunk.Key, objectSize int64) (io.ReadCloser, error) {
+	if p.Self {
+		return nil, fmt.Errorf("cluster: refusing to FillFromPeer for self")
+	}
+
+	scheme := "http"
+	if c.cfg.InternalTLS.Enabled {
+		scheme = "https"
+	}
+
+	port := strconv.Itoa(p.Port)
+	if p.Port == 0 {
+		_, defaultPort, err := net.SplitHostPort(c.cfg.InternalListen)
+		if err != nil {
+			defaultPort = "8444"
+		}
+
+		port = defaultPort
+	}
+
+	target := url.URL{
+		Scheme:   scheme,
+		Host:     net.JoinHostPort(p.IP, port),
+		Path:     "/internal/fill",
+		RawQuery: encodeChunkKey(k, objectSize),
+	}
+
+	c.log.LogAttrs(ctx, slog.LevelDebug, "fill_from_peer_request",
+		slog.String("peer_ip", p.IP),
+		slog.String("peer_port", port),
+		slog.String("origin_id", k.OriginID),
+		slog.String("bucket", k.Bucket),
+		slog.String("key", k.ObjectKey),
+		slog.Int64("index", k.Index),
+		slog.Int64("object_size", objectSize),
+	)
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, target.String(), nil)
+	if err != nil {
+		return nil, fmt.Errorf("cluster: build internal-fill request: %w", err)
+	}
+
+	req.Header.Set("X-Orca-Internal", "1")
+
+	resp, err := c.httpClient.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("cluster: internal-fill RPC: %w", err)
+	}
+
+	if resp.StatusCode == http.StatusConflict {
+		_ = resp.Body.Close() //nolint:errcheck // best-effort close on error path
+
+		c.log.LogAttrs(ctx, slog.LevelDebug, "fill_from_peer_not_coordinator",
+			slog.String("peer_ip", p.IP),
+			slog.String("origin_id", k.OriginID),
+			slog.Int64("index", k.Index),
+		)
+
+		return nil, ErrPeerNotCoordinator
+	}
+
+	if resp.StatusCode/100 != 2 {
+		body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024)) //nolint:errcheck // best-effort error body read
+		_ = resp.Body.Close()                                  //nolint:errcheck // best-effort close on error path
+
+		return nil, fmt.Errorf("cluster: internal-fill RPC returned %d: %s",
+			resp.StatusCode, string(body))
+	}
+
+	c.log.LogAttrs(ctx, slog.LevelDebug, "fill_from_peer_response",
+		slog.String("peer_ip", p.IP),
+		slog.Int("status", resp.StatusCode),
+		slog.Int64("content_length", resp.ContentLength),
+	)
+
+	// Wrap the response body in a defense-in-depth validator that
+	// ensures the peer delivered exactly Content-Length bytes.
+	// net/http already raises io.ErrUnexpectedEOF when the body
+	// closes short of an explicit Content-Length, but the wrapper
+	// makes that contract explicit at the call site (so readers of
+	// FillFromPeer do not need to reason about transport internals)
+	// and guards against future changes to net/http's behavior.
+	if resp.ContentLength > 0 {
+		return &validatingReader{
+			rc:       resp.Body,
+			expected: resp.ContentLength,
+		}, nil
+	}
+
+	return resp.Body, nil
+}
+
+// validatingReader wraps an io.ReadCloser and returns
+// io.ErrUnexpectedEOF if the underlying stream closes after fewer
+// than expected bytes. Used by FillFromPeer to detect truncated
+// cross-replica internal-fill responses.
+type validatingReader struct {
+	rc       io.ReadCloser
+	expected int64
+	got      int64
+}
+
+func (r *validatingReader) Read(p []byte) (int, error) {
+	n, err := r.rc.Read(p)
+	r.got += int64(n)
+
+	if errors.Is(err, io.EOF) && r.got != r.expected {
+		return n, fmt.Errorf("cluster: internal-fill truncated: got %d bytes, expected %d: %w",
+			r.got, r.expected, io.ErrUnexpectedEOF)
+	}
+
+	return n, err
+}
+
+func (r *validatingReader) Close() error { return r.rc.Close() }
+
+// ErrPeerNotCoordinator is returned by FillFromPeer when the peer
+// reports it is not the coordinator (membership disagreement).
+var ErrPeerNotCoordinator = fmt.Errorf("cluster: peer is not the coordinator (409 Conflict)")
+
+func (c *Cluster) refreshLoop(ctx context.Context) {
+	defer close(c.done)
+
+	t := time.NewTicker(c.cfg.MembershipRefresh)
+	defer t.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-t.C:
+			c.refresh(ctx)
+		}
+	}
+}
+
+func (c *Cluster) refresh(ctx context.Context) {
+	peers, err := c.source.Peers(ctx)
+	if err != nil {
+		// A cancelled parent ctx (process shutdown) is not a
+		// discovery failure: it means the refresh loop is exiting.
+		// Bumping the streak counter on the way out would push the
+		// final snapshot into the self-only fallback path and emit
+		// a noisy 'discovery failed' warning during normal
+		// shutdown.
+		if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
+			return
+		}
+		// Discovery failed. Retain the previous snapshot if we have
+		// one and we have not exceeded the staleness ceiling; the
+		// internal-fill RPC fallback (cluster.ErrPeerNotCoordinator
+		// -> local fill in fetch.Coordinator.GetChunk) absorbs
+		// pointing at briefly-stale peers. On bootstrap (no previous
+		// snapshot) or after too many consecutive errors, fall back
+		// to a self-only peer set so we keep making forward progress.
+		streak := c.consecutiveRefreshErrors.Add(1)
+
+		if c.peers.Load() != nil && streak <= maxStalePeerRefreshes {
+			c.log.LogAttrs(ctx, slog.LevelWarn, "cluster: peer discovery failed; retaining previous snapshot",
+				slog.Any("err", err),
+				slog.Int64("consecutive_errors", streak),
+			)
+
+			return
+		}
+
+		self := []Peer{{IP: c.cfg.SelfPodIP, Self: true}}
+		c.storePeerSet(ctx, self, "self_only_fallback")
+
+		return
+	}
+
+	c.consecutiveRefreshErrors.Store(0)
+
+	if len(peers) == 0 {
+		// DNS legitimately reports no peers (e.g. headless Service
+		// has no Ready pods other than maybe self). Apply self-only
+		// fallback.
+		self := []Peer{{IP: c.cfg.SelfPodIP, Self: true}}
+		c.storePeerSet(ctx, self, "empty_discovery_self_only")
+
+		return
+	}
+	// Ensure self is always in the set even if discovery hasn't
+	// caught up yet.
+	hasSelf := false
+
+	for _, p := range peers {
+		if p.Self {
+			hasSelf = true
+			break
+		}
+	}
+
+	if !hasSelf {
+		peers = append(peers, Peer{IP: c.cfg.SelfPodIP, Self: true})
+	}
+
+	c.storePeerSet(ctx, peers, "discovery_ok")
+}
+
+// storePeerSet atomically swaps in a fresh peer-set snapshot and
+// emits trace lines describing the transition. A per-cycle debug
+// emission fires unconditionally; an info-level 'peer_set_changed'
+// emission fires only when the rendered set differs from the
+// previously stored snapshot. The reason argument tags the source
+// of the new snapshot for diagnostic clarity.
+func (c *Cluster) storePeerSet(ctx context.Context, peers []Peer, reason string) {
+	prev := c.peers.Load()
+	c.peers.Store(&peers)
+
+	c.log.LogAttrs(ctx, slog.LevelDebug, "peer_set_refreshed",
+		slog.String("reason", reason),
+		slog.Int("count", len(peers)),
+	)
+
+	if prev == nil {
+		// First snapshot: log it at info so operators see the
+		// bootstrap transition.
+		c.log.LogAttrs(ctx, slog.LevelInfo, "peer_set_initial",
+			slog.String("reason", reason),
+			slog.Int("count", len(peers)),
+		)
+
+		return
+	}
+
+	added, removed := diffPeers(*prev, peers)
+	if len(added) == 0 && len(removed) == 0 {
+		return
+	}
+
+	c.log.LogAttrs(ctx, slog.LevelInfo, "peer_set_changed",
+		slog.String("reason", reason),
+		slog.Int("count", len(peers)),
+		slog.Any("added", added),
+		slog.Any("removed", removed),
+	)
+}
+
+// diffPeers returns the IP+Port lists added and removed between the
+// previous and next snapshots. Self flag is ignored for diff purposes
+// because membership identity is the (ip, port) tuple; the same peer
+// flipping Self is a no-op for membership transitions.
+func diffPeers(prev, next []Peer) (added, removed []string) {
+	seen := make(map[string]bool, len(prev))
+	for _, p := range prev {
+		seen[peerKey(p)] = true
+	}
+
+	nextSet := make(map[string]bool, len(next))
+	for _, p := range next {
+		nextSet[peerKey(p)] = true
+
+		if !seen[peerKey(p)] {
+			added = append(added, peerKey(p))
+		}
+	}
+
+	for _, p := range prev {
+		if !nextSet[peerKey(p)] {
+			removed = append(removed, peerKey(p))
+		}
+	}
+
+	return added, removed
+}
+
+func peerKey(p Peer) string {
+	if p.Port == 0 {
+		return p.IP
+	}
+
+	return fmt.Sprintf("%s:%d", p.IP, p.Port)
+}
+
+func newHTTPClient(cfg config.Cluster) (*http.Client, error) {
+	// Guard: internal TLS configuration is not yet wired through to
+	// the transport. Refusing to start when cfg.InternalTLS.Enabled
+	// is true prevents a silent security downgrade in which the
+	// client would dial https:// against the system trust store
+	// instead of the configured CA / client cert. The production
+	// path (load CAFile + optional client cert/key into
+	// tr.TLSClientConfig) is not implemented; this guard must be
+	// removed in tandem with that work.
+	if cfg.InternalTLS.Enabled {
+		return nil, fmt.Errorf("cluster: internal TLS requested (cluster.internal_tls.enabled=true) but not yet implemented; refusing to start")
+	}
+
+	// DialContext bounds connect-level latency independently of the
+	// caller's ctx. Without this, a stuck TCP SYN against a half-
+	// failed peer would hang until the caller's deadline (which can
+	// be the full 5-minute fill ctx for leader-side fills). 10s is
+	// generous for in-DC latency and short enough that a failed-fast
+	// peer fallback is visible.
+	dialer := &net.Dialer{
+		Timeout:   10 * time.Second,
+		KeepAlive: 30 * time.Second,
+	}
+
+	tr := &http.Transport{
+		DialContext:         dialer.DialContext,
+		MaxIdleConns:        16,
+		MaxIdleConnsPerHost: 4,
+		IdleConnTimeout:     30 * time.Second,
+		// TLSHandshakeTimeout bounds the handshake separately from
+		// the request ctx so a malicious / misconfigured peer cannot
+		// hold a half-open TLS connection past the dial timeout.
+		TLSHandshakeTimeout:   10 * time.Second,
+		ExpectContinueTimeout: 1 * time.Second,
+		ForceAttemptHTTP2:     true,
+	}
+
+	// No http.Client.Timeout: it is the request-total wall clock and
+	// would clamp long-running internal-fill body streams (an 8 MiB
+	// chunk on a degraded inter-pod link can exceed 60s). The caller's
+	// ctx (an edge request ctx for client-driven fills, the 5-minute
+	// detached fill ctx in fetch.runFill for leader-side ones) is the
+	// body-read deadline; the Transport-level Dial / TLS handshake
+	// timeouts above bound the connection-establishment surface
+	// independently.
+	return &http.Client{
+		Transport: tr,
+	}, nil
+}
+
+// Score returns the rendezvous-hash score for (peer, key). Exposed so
+// integration tests can craft phantom peers that deterministically
+// win or lose against a real peer for a given key (used to induce
+// membership disagreement scenarios).
+func Score(p Peer, key []byte) uint64 {
+	return rendezvousScore(p, key)
+}
+
+func rendezvousScore(p Peer, key []byte) uint64 {
+	h := sha256.New()
+	h.Write([]byte(p.IP))
+	h.Write([]byte{0})
+
+	if p.Port != 0 {
+		// In production every peer has Port=0 so this branch never
+		// fires and the score is identical to historical behavior
+		// (sha256(ip || 0 || key)). Tests with multiple peers sharing
+		// 127.0.0.1 set distinct Ports so the score differentiates
+		// replicas.
+		var pb [4]byte
+		binary.BigEndian.PutUint32(pb[:], uint32(p.Port))
+		h.Write(pb[:])
+		h.Write([]byte{0})
+	}
+
+	h.Write(key)
+	sum := h.Sum(nil)
+
+	return binary.BigEndian.Uint64(sum[:8])
+}
+
+func encodeChunkKey(k chunk.Key, objectSize int64) string {
+	v := url.Values{}
+	v.Set("origin_id", k.OriginID)
+	v.Set("bucket", k.Bucket)
+	v.Set("key", k.ObjectKey)
+	v.Set("etag", k.ETag)
+	v.Set("chunk_size", strconv.FormatInt(k.ChunkSize, 10))
+	v.Set("index", strconv.FormatInt(k.Index, 10))
+	v.Set("object_size", strconv.FormatInt(objectSize, 10))
+
+	return v.Encode()
+}
+
+// DecodeChunkKey parses query params into a Key plus the authoritative
+// object size. Used by the internal listener (server/internal/fill).
+func DecodeChunkKey(values url.Values) (chunk.Key, int64, error) {
+	chunkSize, err := strconv.ParseInt(values.Get("chunk_size"), 10, 64)
+	if err != nil {
+		return chunk.Key{}, 0, fmt.Errorf("invalid chunk_size: %w", err)
+	}
+
+	if chunkSize <= 0 {
+		return chunk.Key{}, 0, fmt.Errorf("invalid chunk_size: must be > 0, got %d", chunkSize)
+	}
+
+	idx, err := strconv.ParseInt(values.Get("index"), 10, 64)
+	if err != nil {
+		return chunk.Key{}, 0, fmt.Errorf("invalid index: %w", err)
+	}
+
+	if idx < 0 {
+		return chunk.Key{}, 0, fmt.Errorf("invalid index: must be >= 0, got %d", idx)
+	}
+
+	objectSize, err := strconv.ParseInt(values.Get("object_size"), 10, 64)
+	if err != nil {
+		return chunk.Key{}, 0, fmt.Errorf("invalid object_size: %w", err)
+	}
+
+	if objectSize <= 0 {
+		return chunk.Key{}, 0, fmt.Errorf("invalid object_size: must be > 0, got %d", objectSize)
+	}
+
+	originID := values.Get("origin_id")
+	bucket := values.Get("bucket")
+	key := values.Get("key")
+	etag := values.Get("etag")
+
+	if originID == "" || key == "" {
+		return chunk.Key{}, 0, fmt.Errorf("missing required key fields")
+	}
+
+	return chunk.Key{
+		OriginID:  originID,
+		Bucket:    bucket,
+		ObjectKey: key,
+		ETag:      etag,
+		ChunkSize: chunkSize,
+		Index:     idx,
+	}, objectSize, nil
+}
diff --git a/internal/orca/cluster/cluster_test.go b/internal/orca/cluster/cluster_test.go
new file mode 100644
index 00000000..431e848a
--- /dev/null
+++ b/internal/orca/cluster/cluster_test.go
@@ -0,0 +1,763 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package cluster
+
+import (
+	"bytes"
+	"context"
+	"errors"
+	"io"
+	"log/slog"
+	"net"
+	"net/http"
+	"net/url"
+	"strconv"
+	"strings"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/Azure/unbounded/internal/orca/chunk"
+	"github.com/Azure/unbounded/internal/orca/config"
+)
+
+// fakePeerSource implements PeerSource for unit tests.
+type fakePeerSource struct {
+	mu    func() ([]Peer, error)
+	calls atomic.Int64
+}
+
+func (f *fakePeerSource) Peers(_ context.Context) ([]Peer, error) {
+	f.calls.Add(1)
+
+	return f.mu()
+}
+
+// TestRefresh_RetainsPreviousOnError verifies that a discovery error
+// after a successful refresh retains the previous peer-set rather
+// than clobbering it with [Self].
+//
+// Regression test for B3.
+func TestRefresh_RetainsPreviousOnError(t *testing.T) {
+	t.Parallel()
+
+	good := []Peer{
+		{IP: "10.0.0.1", Self: false},
+		{IP: "10.0.0.2", Self: true},
+		{IP: "10.0.0.3", Self: false},
+	}
+
+	var failing atomic.Bool
+
+	src := &fakePeerSource{
+		mu: func() ([]Peer, error) {
+			if failing.Load() {
+				return nil, errors.New("transient DNS failure")
+			}
+
+			out := make([]Peer, len(good))
+			copy(out, good)
+
+			return out, nil
+		},
+	}
+
+	c, err := New(t.Context(),
+		config.Cluster{
+			Service:           "test",
+			SelfPodIP:         "10.0.0.2",
+			MembershipRefresh: time.Hour, // disable auto-refresh; we drive it manually
+		},
+		WithPeerSource(src),
+	)
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+
+	t.Cleanup(func() { _ = c.Close(context.Background()) })
+
+	// Initial refresh ran during New; verify good peers are loaded.
+	if got := len(c.Peers()); got != 3 {
+		t.Fatalf("initial Peers()=%d want 3", got)
+	}
+
+	failing.Store(true)
+	// First few error refreshes: retain previous snapshot.
+	for i := 0; i < maxStalePeerRefreshes; i++ {
+		c.refresh(t.Context())
+
+		if got := len(c.Peers()); got != 3 {
+			t.Errorf("after error %d: Peers()=%d want 3 (retain previous)", i+1, got)
+		}
+	}
+	// Next refresh exceeds the staleness ceiling -> fall back to self.
+	c.refresh(t.Context())
+
+	if got := c.Peers(); len(got) != 1 || !got[0].Self {
+		t.Errorf("after ceiling exceeded: Peers()=%+v want [Self]", got)
+	}
+	// Recovery: source returns good peers again. Error counter resets.
+	failing.Store(false)
+	c.refresh(t.Context())
+
+	if got := len(c.Peers()); got != 3 {
+		t.Errorf("after recovery: Peers()=%d want 3", got)
+	}
+
+	if got := c.consecutiveRefreshErrors.Load(); got != 0 {
+		t.Errorf("error counter not reset after success: got %d", got)
+	}
+}
+
+// TestRefresh_BootstrapErrorFallsBackToSelf verifies that on bootstrap
+// (no previous snapshot) a discovery error falls back to [Self]
+// immediately - we cannot retain something that does not exist.
+func TestRefresh_BootstrapErrorFallsBackToSelf(t *testing.T) {
+	t.Parallel()
+
+	src := &fakePeerSource{
+		mu: func() ([]Peer, error) {
+			return nil, errors.New("DNS not reachable yet")
+		},
+	}
+
+	c, err := New(t.Context(),
+		config.Cluster{
+			Service:           "test",
+			SelfPodIP:         "10.0.0.1",
+			MembershipRefresh: time.Hour,
+		},
+		WithPeerSource(src),
+	)
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+
+	t.Cleanup(func() { _ = c.Close(context.Background()) })
+
+	got := c.Peers()
+	if len(got) != 1 || !got[0].Self {
+		t.Errorf("bootstrap with error source: Peers()=%+v want [Self]", got)
+	}
+}
+
+// TestRefresh_EmptyResultFallsBackToSelf verifies that a successful
+// discovery returning zero peers (the legitimate "I'm alone" answer)
+// still falls back to [Self] without bumping the error counter.
+func TestRefresh_EmptyResultFallsBackToSelf(t *testing.T) {
+	t.Parallel()
+
+	src := &fakePeerSource{
+		mu: func() ([]Peer, error) {
+			return nil, nil // no error, zero peers
+		},
+	}
+
+	c, err := New(t.Context(),
+		config.Cluster{
+			Service:           "test",
+			SelfPodIP:         "10.0.0.1",
+			MembershipRefresh: time.Hour,
+		},
+		WithPeerSource(src),
+	)
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+
+	t.Cleanup(func() { _ = c.Close(context.Background()) })
+
+	got := c.Peers()
+	if len(got) != 1 || !got[0].Self {
+		t.Errorf("empty source: Peers()=%+v want [Self]", got)
+	}
+
+	if got := c.consecutiveRefreshErrors.Load(); got != 0 {
+		t.Errorf("empty (non-error) result should not bump error counter; got %d", got)
+	}
+}
+
+// TestFillFromPeer_DetectsTruncation verifies that the validating
+// reader returned by FillFromPeer surfaces io.ErrUnexpectedEOF when
+// the peer advertises a Content-Length but the connection closes
+// before that many bytes have been delivered. Without the validator
+// the requester would observe a clean io.EOF and silently pass
+// short bytes through to the client.
+//
+// Regression test for B7.
+func TestFillFromPeer_DetectsTruncation(t *testing.T) {
+	t.Parallel()
+
+	const advertised = 100
+
+	const delivered = 50
+
+	// Use a raw TCP listener so we have full control over the wire
+	// format: write Content-Length: 100, then write 50 body bytes,
+	// then close the connection mid-stream.
+	ln, err := net.Listen("tcp", "127.0.0.1:0")
+	if err != nil {
+		t.Fatalf("listen: %v", err)
+	}
+
+	t.Cleanup(func() { _ = ln.Close() }) //nolint:errcheck // test cleanup
+
+	go func() {
+		conn, err := ln.Accept()
+		if err != nil {
+			return
+		}
+
+		defer conn.Close() //nolint:errcheck // test cleanup
+		// Consume request headers up through the blank line.
+		buf := make([]byte, 4096)
+
+		if _, err := conn.Read(buf); err != nil {
+			return
+		}
+
+		resp := "HTTP/1.1 200 OK\r\n" +
+			"Content-Length: " + strconv.Itoa(advertised) + "\r\n" +
+			"Content-Type: application/octet-stream\r\n" +
+			"\r\n"
+		if _, err := conn.Write([]byte(resp)); err != nil {
+			return
+		}
+
+		if _, err := conn.Write(make([]byte, delivered)); err != nil {
+			return
+		}
+		// Close mid-body without writing the remaining bytes.
+	}()
+
+	host, portStr, err := net.SplitHostPort(ln.Addr().String())
+	if err != nil {
+		t.Fatalf("split host port: %v", err)
+	}
+
+	port, err := strconv.Atoi(portStr)
+	if err != nil {
+		t.Fatalf("parse port: %v", err)
+	}
+
+	c, err := New(t.Context(),
+		config.Cluster{
+			Service:           "test",
+			SelfPodIP:         "10.0.0.1",
+			MembershipRefresh: time.Hour,
+			InternalListen:    "0.0.0.0:8444",
+		},
+		WithPeerSource(&fakePeerSource{mu: func() ([]Peer, error) {
+			return []Peer{{IP: "10.0.0.1", Self: true}}, nil
+		}}),
+	)
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+
+	t.Cleanup(func() { _ = c.Close(context.Background()) })
+
+	peer := Peer{IP: host, Port: port}
+	key := chunk.Key{
+		OriginID:  "test-origin",
+		Bucket:    "test-bucket",
+		ObjectKey: "test-object",
+		ETag:      "test-etag",
+		ChunkSize: advertised,
+		Index:     0,
+	}
+
+	body, err := c.FillFromPeer(t.Context(), peer, key, advertised)
+	if err != nil {
+		t.Fatalf("FillFromPeer: %v", err)
+	}
+
+	defer body.Close() //nolint:errcheck // test cleanup
+
+	got, err := io.ReadAll(body)
+	if !errors.Is(err, io.ErrUnexpectedEOF) {
+		t.Errorf("expected io.ErrUnexpectedEOF, got err=%v (read %d bytes)", err, len(got))
+	}
+
+	if len(got) != delivered {
+		t.Errorf("got %d bytes, expected %d (the delivered prefix)", len(got), delivered)
+	}
+}
+
+// TestNewHTTPClient_NoWallTimeout asserts that the default
+// internal-RPC HTTP client carries no Client.Timeout. Client.Timeout
+// is a request-total wall clock that would clamp long-running fill
+// body streams (an 8 MiB chunk on a degraded inter-pod link can
+// exceed any reasonable hardcoded bound). The caller's ctx is the
+// sole deadline for body reads.
+func TestNewHTTPClient_NoWallTimeout(t *testing.T) {
+	t.Parallel()
+
+	c, err := newHTTPClient(config.Cluster{})
+	if err != nil {
+		t.Fatalf("newHTTPClient: %v", err)
+	}
+
+	if c.Timeout != 0 {
+		t.Errorf("internal-RPC http.Client.Timeout = %v, want 0", c.Timeout)
+	}
+}
+
+// TestNewHTTPClient_ConnectTimeouts asserts that the Transport
+// carries bounded connect-level timeouts independent of the
+// caller's ctx. Without these, a stuck TCP SYN or stalled TLS
+// handshake against a half-failed peer would hang until the
+// caller's deadline (which is the full 5-minute fill ctx for
+// leader-side fills, causing slot starvation).
+//
+// Regression for H-4.
+func TestNewHTTPClient_ConnectTimeouts(t *testing.T) {
+	t.Parallel()
+
+	c, err := newHTTPClient(config.Cluster{})
+	if err != nil {
+		t.Fatalf("newHTTPClient: %v", err)
+	}
+
+	tr, ok := c.Transport.(*http.Transport)
+	if !ok {
+		t.Fatalf("Transport is %T; want *http.Transport", c.Transport)
+	}
+
+	if tr.TLSHandshakeTimeout == 0 {
+		t.Errorf("TLSHandshakeTimeout is 0; want bounded")
+	}
+
+	if tr.DialContext == nil {
+		t.Errorf("DialContext is nil; expected bounded dialer")
+	}
+}
+
+// TestNewHTTPClient_InternalTLSEnabledRefusesToStart verifies that
+// newHTTPClient refuses to construct a client when
+// cfg.InternalTLS.Enabled=true. The TLS configuration is not yet
+// wired into the transport (no TLSClientConfig); returning a working
+// client in that case would silently dial https:// against the
+// system trust store instead of the configured CA, downgrading the
+// security posture. The constructor must fail loudly until the
+// production TLS wiring is implemented.
+func TestNewHTTPClient_InternalTLSEnabledRefusesToStart(t *testing.T) {
+	t.Parallel()
+
+	cfg := config.Cluster{
+		InternalTLS: config.InternalTLS{Enabled: true},
+	}
+
+	c, err := newHTTPClient(cfg)
+	if err == nil {
+		t.Fatalf("newHTTPClient with InternalTLS.Enabled=true returned client %v; want error", c)
+	}
+}
+
+// TestFillFromPeer_CtxDeadlineHonored verifies that the caller's ctx
+// deadline (rather than any hardcoded wall clock inside the cluster's
+// HTTP client) is what bounds the cross-replica fill. Sets up a
+// slow-paced TCP server that delivers a full Content-Length body
+// over ~250ms, and calls FillFromPeer with a 50ms ctx; expects the
+// read to fail with context.DeadlineExceeded.
+//
+// Companion to the wall-timeout removal: regression-tests that ctx
+// propagation still bounds the request even though the
+// Client.Timeout safety net is gone.
+func TestFillFromPeer_CtxDeadlineHonored(t *testing.T) {
+	t.Parallel()
+
+	const advertised = 1024
+
+	ln, err := net.Listen("tcp", "127.0.0.1:0")
+	if err != nil {
+		t.Fatalf("listen: %v", err)
+	}
+
+	t.Cleanup(func() { _ = ln.Close() }) //nolint:errcheck // test cleanup
+
+	go func() {
+		conn, err := ln.Accept()
+		if err != nil {
+			return
+		}
+
+		defer conn.Close() //nolint:errcheck // test cleanup
+
+		buf := make([]byte, 4096)
+		if _, err := conn.Read(buf); err != nil {
+			return
+		}
+
+		resp := "HTTP/1.1 200 OK\r\n" +
+			"Content-Length: " + strconv.Itoa(advertised) + "\r\n" +
+			"Content-Type: application/octet-stream\r\n" +
+			"\r\n"
+		if _, err := conn.Write([]byte(resp)); err != nil {
+			return
+		}
+		// Drip body bytes slowly: 64 bytes every 20ms (~ 320ms for
+		// the full 1 KiB), far exceeding the 50ms ctx deadline.
+		body := make([]byte, advertised)
+
+		for i := 0; i < advertised; i += 64 {
+			end := i + 64
+			if end > advertised {
+				end = advertised
+			}
+
+			if _, err := conn.Write(body[i:end]); err != nil {
+				return
+			}
+
+			time.Sleep(20 * time.Millisecond)
+		}
+	}()
+
+	host, portStr, err := net.SplitHostPort(ln.Addr().String())
+	if err != nil {
+		t.Fatalf("split host port: %v", err)
+	}
+
+	port, err := strconv.Atoi(portStr)
+	if err != nil {
+		t.Fatalf("parse port: %v", err)
+	}
+
+	c, err := New(t.Context(),
+		config.Cluster{
+			Service:           "test",
+			SelfPodIP:         "10.0.0.1",
+			MembershipRefresh: time.Hour,
+			InternalListen:    "0.0.0.0:8444",
+		},
+		WithPeerSource(&fakePeerSource{mu: func() ([]Peer, error) {
+			return []Peer{{IP: "10.0.0.1", Self: true}}, nil
+		}}),
+	)
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+
+	t.Cleanup(func() { _ = c.Close(context.Background()) })
+
+	peer := Peer{IP: host, Port: port}
+	key := chunk.Key{
+		OriginID:  "test-origin",
+		Bucket:    "test-bucket",
+		ObjectKey: "test-object",
+		ETag:      "test-etag",
+		ChunkSize: advertised,
+		Index:     0,
+	}
+
+	ctx, cancel := context.WithTimeout(t.Context(), 50*time.Millisecond)
+	defer cancel()
+
+	body, err := c.FillFromPeer(ctx, peer, key, advertised)
+	if err != nil {
+		if !errors.Is(err, context.DeadlineExceeded) {
+			t.Fatalf("FillFromPeer err = %v, want context.DeadlineExceeded (or success then deadline on read)", err)
+		}
+
+		return
+	}
+
+	defer body.Close() //nolint:errcheck // test cleanup
+
+	_, readErr := io.ReadAll(body)
+	if !errors.Is(readErr, context.DeadlineExceeded) {
+		t.Errorf("ReadAll err = %v, want context.DeadlineExceeded", readErr)
+	}
+}
+
+// TestWithHTTPClient_Overrides verifies the test seam: tests can
+// inject an alternate http.Client (used to give a deterministic
+// short timeout or custom transport behaviour).
+func TestWithHTTPClient_Overrides(t *testing.T) {
+	t.Parallel()
+
+	custom := &http.Client{Timeout: 42 * time.Millisecond}
+
+	c, err := New(t.Context(),
+		config.Cluster{
+			Service:           "test",
+			SelfPodIP:         "10.0.0.1",
+			MembershipRefresh: time.Hour,
+		},
+		WithPeerSource(&fakePeerSource{mu: func() ([]Peer, error) {
+			return []Peer{{IP: "10.0.0.1", Self: true}}, nil
+		}}),
+		WithHTTPClient(custom),
+	)
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+
+	t.Cleanup(func() { _ = c.Close(context.Background()) })
+
+	if c.httpClient != custom {
+		t.Errorf("httpClient not overridden by WithHTTPClient")
+	}
+}
+
+// TestWithLogger_OverridesDefault verifies the cluster honours the
+// injected slog.Logger so cluster.refresh's warn-level
+// retain-snapshot message and the debug-level emissions route to
+// the caller's configured handler rather than slog.Default.
+func TestWithLogger_OverridesDefault(t *testing.T) {
+	t.Parallel()
+
+	injected := slog.New(slog.NewTextHandler(io.Discard, nil))
+
+	c, err := New(t.Context(),
+		config.Cluster{
+			Service:           "test",
+			SelfPodIP:         "10.0.0.1",
+			MembershipRefresh: time.Hour,
+		},
+		WithPeerSource(&fakePeerSource{mu: func() ([]Peer, error) {
+			return []Peer{{IP: "10.0.0.1", Self: true}}, nil
+		}}),
+		WithLogger(injected),
+	)
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+
+	t.Cleanup(func() { _ = c.Close(context.Background()) })
+
+	if c.log != injected {
+		t.Errorf("Cluster.log not the injected logger")
+	}
+}
+
+// TestRefresh_EmitsMembershipTransition verifies that a peer-set
+// change (member added) surfaces a Info-level 'peer_set_changed'
+// log line. Stable refreshes (no delta) must not re-emit this line.
+func TestRefresh_EmitsMembershipTransition(t *testing.T) {
+	t.Parallel()
+
+	initial := []Peer{
+		{IP: "10.0.0.2", Self: true},
+	}
+
+	current := initial
+
+	src := &fakePeerSource{
+		mu: func() ([]Peer, error) {
+			out := make([]Peer, len(current))
+			copy(out, current)
+
+			return out, nil
+		},
+	}
+
+	var buf bytes.Buffer
+
+	log := slog.New(slog.NewTextHandler(&buf, &slog.HandlerOptions{Level: slog.LevelDebug}))
+
+	c, err := New(t.Context(),
+		config.Cluster{
+			Service:           "test",
+			SelfPodIP:         "10.0.0.2",
+			MembershipRefresh: time.Hour,
+		},
+		WithPeerSource(src),
+		WithLogger(log),
+	)
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+
+	t.Cleanup(func() { _ = c.Close(context.Background()) })
+
+	// Initial snapshot landed during New: peer_set_initial emitted.
+	if !strings.Contains(buf.String(), "peer_set_initial") {
+		t.Errorf("expected peer_set_initial on bootstrap; got %q", buf.String())
+	}
+
+	buf.Reset()
+
+	// Stable refresh: no delta -> only the debug peer_set_refreshed.
+	c.refresh(t.Context())
+
+	if strings.Contains(buf.String(), "peer_set_changed") {
+		t.Errorf("peer_set_changed should not fire when peer-set is stable; got %q", buf.String())
+	}
+
+	if !strings.Contains(buf.String(), "peer_set_refreshed") {
+		t.Errorf("expected per-cycle peer_set_refreshed; got %q", buf.String())
+	}
+
+	buf.Reset()
+
+	// Add a peer: peer_set_changed must fire with the 'added' key.
+	current = append([]Peer{}, initial...)
+	current = append(current, Peer{IP: "10.0.0.3"})
+
+	c.refresh(t.Context())
+
+	if !strings.Contains(buf.String(), "peer_set_changed") {
+		t.Errorf("peer_set_changed missing on add; got %q", buf.String())
+	}
+
+	if !strings.Contains(buf.String(), "10.0.0.3") {
+		t.Errorf("added peer IP missing from log; got %q", buf.String())
+	}
+}
+
+// TestCoordinator_EmitsDebugSelection verifies the per-call debug
+// emission carrying the chosen-peer and rendezvous score for a
+// chunk. Operators rely on this to diagnose routing surprises.
+func TestCoordinator_EmitsDebugSelection(t *testing.T) {
+	t.Parallel()
+
+	var buf bytes.Buffer
+
+	log := slog.New(slog.NewTextHandler(&buf, &slog.HandlerOptions{Level: slog.LevelDebug}))
+
+	c, err := New(t.Context(),
+		config.Cluster{
+			Service:           "test",
+			SelfPodIP:         "10.0.0.1",
+			MembershipRefresh: time.Hour,
+		},
+		WithPeerSource(&fakePeerSource{mu: func() ([]Peer, error) {
+			return []Peer{{IP: "10.0.0.1", Self: true}}, nil
+		}}),
+		WithLogger(log),
+	)
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+
+	t.Cleanup(func() { _ = c.Close(context.Background()) })
+
+	buf.Reset()
+
+	c.Coordinator(chunk.Key{
+		OriginID: "ox", Bucket: "b", ObjectKey: "o", ChunkSize: 1024, Index: 5,
+	})
+
+	out := buf.String()
+	for _, want := range []string{"coordinator_selected", "chosen_ip=10.0.0.1", "is_self=true", "index=5"} {
+		if !strings.Contains(out, want) {
+			t.Errorf("expected %q in coord debug output; got %q", want, out)
+		}
+	}
+}
+
+// TestRefresh_CtxCanceledDoesNotBumpErrorCounter verifies that a
+// refresh call whose ctx has been cancelled (the normal shutdown
+// path) does not bump consecutiveRefreshErrors or churn the stored
+// peer-set into the self-only fallback. Without this guard the
+// final refresh during graceful shutdown produces a 'discovery
+// failed' warning and pushes the membership into the self-only
+// path even though nothing has actually gone wrong.
+func TestRefresh_CtxCanceledDoesNotBumpErrorCounter(t *testing.T) {
+	t.Parallel()
+
+	good := []Peer{
+		{IP: "10.0.0.1", Self: false},
+		{IP: "10.0.0.2", Self: true},
+	}
+
+	var failWithCancel atomic.Bool
+
+	src := &fakePeerSource{
+		mu: func() ([]Peer, error) {
+			if failWithCancel.Load() {
+				return nil, context.Canceled
+			}
+
+			out := make([]Peer, len(good))
+			copy(out, good)
+
+			return out, nil
+		},
+	}
+
+	c, err := New(t.Context(),
+		config.Cluster{
+			Service:           "test",
+			SelfPodIP:         "10.0.0.2",
+			MembershipRefresh: time.Hour, // disable auto-refresh; drive manually.
+		},
+		WithPeerSource(src),
+	)
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+
+	t.Cleanup(func() { _ = c.Close(context.Background()) })
+
+	if got := c.consecutiveRefreshErrors.Load(); got != 0 {
+		t.Fatalf("pre-test error counter = %d, want 0", got)
+	}
+
+	initialPeers := len(c.Peers())
+
+	failWithCancel.Store(true)
+	c.refresh(t.Context())
+
+	if got := c.consecutiveRefreshErrors.Load(); got != 0 {
+		t.Errorf("counter bumped on ctx.Canceled; got %d want 0", got)
+	}
+
+	if got := len(c.Peers()); got != initialPeers {
+		t.Errorf("peer-set churned on ctx.Canceled; got %d want %d", got, initialPeers)
+	}
+}
+
+// TestDecodeChunkKey_RejectsZeroObjectSize verifies that the wire
+// boundary rejects object_size == 0 as well as negative values.
+// The previous code accepted 0 as a sentinel for "unknown size"
+// which became a foot-gun (validation skipped, malformed range,
+// validating-reader bypassed); production callers always know the
+// size from a prior Head, so tightening the contract removes the
+// foot-gun without breaking any real caller.
+//
+// Regression for C-2 / C-3 / C-4.
+func TestDecodeChunkKey_RejectsZeroObjectSize(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name       string
+		objectSize string
+		wantErr    bool
+	}{
+		{"zero rejected", "0", true},
+		{"negative rejected", "-1", true},
+		{"positive accepted", "1024", false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			v := url.Values{}
+			v.Set("origin_id", "ox")
+			v.Set("bucket", "b")
+			v.Set("key", "o")
+			v.Set("etag", "e1")
+			v.Set("chunk_size", "1024")
+			v.Set("index", "0")
+			v.Set("object_size", tt.objectSize)
+
+			_, _, err := DecodeChunkKey(v)
+			if tt.wantErr {
+				if err == nil {
+					t.Errorf("DecodeChunkKey(object_size=%s) returned nil; want error", tt.objectSize)
+				} else if !strings.Contains(err.Error(), "object_size") {
+					t.Errorf("error does not mention object_size: %v", err)
+				}
+
+				return
+			}
+
+			if err != nil {
+				t.Errorf("DecodeChunkKey(object_size=%s) unexpected error: %v", tt.objectSize, err)
+			}
+		})
+	}
+}
diff --git a/internal/orca/config/config.go b/internal/orca/config/config.go
new file mode 100644
index 00000000..dd86e855
--- /dev/null
+++ b/internal/orca/config/config.go
@@ -0,0 +1,544 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Package config defines Orca's YAML configuration shape and loading
+// helpers.
+//
+// The schema is an intentional subset of the full Orca configuration
+// surface; extending it later is a matter of adding fields and keeping
+// zero-values backward-compatible.
+package config
+
+import (
+	"fmt"
+	"log/slog"
+	"os"
+	"strings"
+	"time"
+
+	"gopkg.in/yaml.v3"
+
+	"github.com/Azure/unbounded/internal/orca/chunk"
+)
+
+// Config is the top-level Orca configuration.
+type Config struct {
+	Server       Server       `yaml:"server"`
+	Origin       Origin       `yaml:"origin"`
+	Cachestore   Cachestore   `yaml:"cachestore"`
+	Cluster      Cluster      `yaml:"cluster"`
+	ChunkCatalog ChunkCatalog `yaml:"chunk_catalog"`
+	Metadata     Metadata     `yaml:"metadata"`
+	Chunking     Chunking     `yaml:"chunking"`
+	Logging      Logging      `yaml:"logging"`
+}
+
+// Logging governs structured-log output. The level controls slog
+// emission filtering; debug surfaces per-request and per-chunk
+// tracing through the fetch coordinator, metadata cache, chunk
+// catalog, cluster, cachestore, and origin drivers.
+//
+// The ORCA_LOG_LEVEL environment variable, if set and non-empty,
+// overrides the YAML-configured Level at process startup. Useful
+// for one-shot debug sessions without re-rendering the configmap.
+type Logging struct {
+	// Level is one of "debug", "info", "warn", "error". Empty
+	// defaults to "info".
+	Level string `yaml:"level"`
+}
+
+// Server holds the client-edge listener configuration plus the
+// ops listener used for kubelet probes (/healthz and /readyz).
+type Server struct {
+	Listen string     `yaml:"listen"`
+	Auth   ServerAuth `yaml:"auth"`
+
+	// OpsListen is the bind address for the operations endpoint
+	// hosting /healthz and /readyz. Plain HTTP, no auth. Kubelet
+	// liveness and readiness probes target this address; production
+	// Service objects do not forward this port externally.
+	OpsListen string `yaml:"ops_listen"`
+}
+
+// ServerAuth governs the client-edge authentication path.
+//
+// Production: enabled=true with mode=bearer or mode=mtls.
+// Dev: enabled=false disables authentication entirely (no token
+// or client cert required). This is a single security knob, not a
+// dev_mode flag.
+type ServerAuth struct {
+	Enabled          bool   `yaml:"enabled"`
+	Mode             string `yaml:"mode"`
+	BearerSecretFile string `yaml:"bearer_secret_file"`
+}
+
+// Origin describes the upstream origin (Azure Blob or AWS S3 in v1).
+type Origin struct {
+	ID           string        `yaml:"id"`
+	Driver       string        `yaml:"driver"` // "azureblob" or "awss3"
+	TargetGlobal int           `yaml:"target_global"`
+	QueueTimeout time.Duration `yaml:"queue_timeout"`
+	Retry        OriginRetry   `yaml:"retry"`
+	Azureblob    Azureblob     `yaml:"azureblob"`
+	AWSS3        AWSS3         `yaml:"awss3"`
+}
+
+// OriginRetry captures the leader-side pre-header retry budget.
+type OriginRetry struct {
+	Attempts         int           `yaml:"attempts"`
+	BackoffInitial   time.Duration `yaml:"backoff_initial"`
+	BackoffMax       time.Duration `yaml:"backoff_max"`
+	MaxTotalDuration time.Duration `yaml:"max_total_duration"`
+}
+
+// Azureblob is the azureblob origin adapter configuration.
+//
+// Page and Append blobs are unconditionally rejected at Head: their
+// random-access mutation model is incompatible with the chunked,
+// immutable cache contract orca relies on. There is no configuration
+// switch for this behaviour.
+type Azureblob struct {
+	Account    string `yaml:"account"`
+	AccountKey string `yaml:"account_key"`
+	Container  string `yaml:"container"`
+
+	// Endpoint, when set, overrides the default Azure Blob service URL
+	// (https://<account>.blob.core.windows.net/). Used in dev to point
+	// at Azurite (http://azurite:10000/devstoreaccount1) so the
+	// azureblob driver path can be exercised without a real Azure
+	// account.
+	Endpoint string `yaml:"endpoint"`
+}
+
+// AWSS3 is the awss3 origin adapter configuration. In dev this points
+// at LocalStack alongside the cachestore (different bucket); in
+// production it points at real AWS S3 with no Endpoint override.
+type AWSS3 struct {
+	Endpoint     string `yaml:"endpoint"` // empty for real AWS S3
+	Region       string `yaml:"region"`
+	Bucket       string `yaml:"bucket"`
+	AccessKey    string `yaml:"access_key"`
+	SecretKey    string `yaml:"secret_key"`
+	UsePathStyle bool   `yaml:"use_path_style"` // true for LocalStack
+}
+
+// Cachestore is the in-DC chunk store configuration.
+type Cachestore struct {
+	Driver string       `yaml:"driver"` // "s3" in v1
+	S3     CachestoreS3 `yaml:"s3"`
+}
+
+// CachestoreS3 is the s3 driver configuration. In dev this points at
+// LocalStack; in production at VAST or another in-DC S3-compatible
+// store.
+//
+// Bucket versioning is unconditionally validated at startup: a
+// versioned bucket silently breaks the no-clobber atomic-commit
+// primitive (PutObject + If-None-Match: *) the driver depends on.
+// There is no configuration switch for this gate.
+type CachestoreS3 struct {
+	Endpoint     string `yaml:"endpoint"`
+	Bucket       string `yaml:"bucket"`
+	Region       string `yaml:"region"`
+	AccessKey    string `yaml:"access_key"`
+	SecretKey    string `yaml:"secret_key"`
+	UsePathStyle bool   `yaml:"use_path_style"` // true for LocalStack
+}
+
+// Cluster captures peer discovery + internal-listener configuration.
+type Cluster struct {
+	Service           string        `yaml:"service"`            // headless Service FQDN
+	MembershipRefresh time.Duration `yaml:"membership_refresh"` // DNS poll interval
+	InternalListen    string        `yaml:"internal_listen"`
+	InternalTLS       InternalTLS   `yaml:"internal_tls"`
+	TargetReplicas    int           `yaml:"target_replicas"`
+	SelfPodIP         string        `yaml:"self_pod_ip"` // resolved from POD_IP env
+}
+
+// InternalTLS governs the internal-listener mTLS posture.
+//
+// Production: enabled=true (mTLS required).
+// Dev: enabled=false (plain HTTP/2). The binary logs WARN at startup.
+type InternalTLS struct {
+	Enabled    bool   `yaml:"enabled"`
+	CertFile   string `yaml:"cert_file"`
+	KeyFile    string `yaml:"key_file"`
+	CAFile     string `yaml:"ca_file"`
+	ServerName string `yaml:"server_name"`
+}
+
+// ChunkCatalog is the in-memory chunk-presence cache configuration.
+type ChunkCatalog struct {
+	MaxEntries int `yaml:"max_entries"`
+}
+
+// Metadata is the object-metadata cache configuration.
+type Metadata struct {
+	TTL         time.Duration `yaml:"ttl"`
+	NegativeTTL time.Duration `yaml:"negative_ttl"`
+	MaxEntries  int           `yaml:"max_entries"`
+}
+
+// Chunking governs chunk size and read-ahead for client GETs.
+//
+// Size is the base chunk size used for objects smaller than the
+// smallest Tier threshold. Tiers, if non-empty, override Size for
+// objects at or above each tier's MinObjectSize: the tier with the
+// largest threshold <= the object's size wins. Tiers must be
+// strictly ascending by MinObjectSize; the loader enforces this
+// at validate time so the runtime selection path can assume sorted
+// input.
+//
+// Readahead is the number of chunks the client-edge GET handler
+// prefetches while streaming the current chunk to the client. It
+// is a pointer so the loader can distinguish an omitted YAML field
+// (defaults to 8) from an explicit "readahead: 0" (disables
+// read-ahead and restores the strictly-sequential chunk-fetch
+// behavior). The cost is bounded by readahead * effective_chunk_size
+// of extra in-flight cachestore body buffers per concurrent GET;
+// cold-fill speculation is additionally bounded by the per-replica
+// origin semaphore (target_per_replica), so peak per-replica
+// cold-buffer memory is at most:
+//
+//	target_per_replica * max(Size, max ChunkSize across Tiers)
+//
+// With the defaults (Size=8 MiB, Tiers up to 128 MiB, 4 replicas at
+// target_global=64), the per-replica ceiling is 16 * 128 MiB = 2 GiB.
+// Operators with tighter memory budgets should lower the highest
+// tier's ChunkSize or drop the largest-object tier entirely.
+type Chunking struct {
+	Size      int64       `yaml:"size"` // bytes per chunk; default 8 MiB
+	Tiers     []ChunkTier `yaml:"tiers"`
+	Readahead *int        `yaml:"readahead"`
+}
+
+// ChunkTier is one entry in the Chunking.Tiers ladder. Objects whose
+// size is at or above MinObjectSize use ChunkSize, unless a
+// higher-threshold tier also matches (in which case the higher tier
+// wins). Both fields must be > 0; ChunkSize must be >= 1 MiB (the
+// floor that applies to Chunking.Size as well).
+type ChunkTier struct {
+	MinObjectSize int64 `yaml:"min_object_size"`
+	ChunkSize     int64 `yaml:"chunk_size"`
+}
+
+// AsChunkTiers returns the configured tier ladder as a []chunk.Tier
+// slice suitable for chunk.SizeFor. Returns nil for an empty list.
+// The slice is in the validated ascending-MinObjectSize order.
+func (c Chunking) AsChunkTiers() []chunk.Tier {
+	if len(c.Tiers) == 0 {
+		return nil
+	}
+
+	out := make([]chunk.Tier, len(c.Tiers))
+	for i, t := range c.Tiers {
+		out[i] = chunk.Tier{MinObjectSize: t.MinObjectSize, ChunkSize: t.ChunkSize}
+	}
+
+	return out
+}
+
+// ReadaheadDepth returns the configured read-ahead depth. A nil
+// pointer (YAML omitted) returns 0; applyDefaults populates the
+// default-on value so configurations that loaded through Load
+// always have a non-nil pointer. Callers that bypass Load (e.g.
+// hand-constructed test configs) get 0 for nil, which matches the
+// "feature disabled" semantics.
+func (c Chunking) ReadaheadDepth() int {
+	if c.Readahead == nil {
+		return 0
+	}
+
+	return *c.Readahead
+}
+
+// Load reads the YAML config file at path and returns a populated
+// Config. Defaults are applied for fields left at zero-value.
+func Load(path string) (*Config, error) {
+	raw, err := os.ReadFile(path)
+	if err != nil {
+		return nil, fmt.Errorf("read %s: %w", path, err)
+	}
+
+	cfg := &Config{}
+	if err := yaml.Unmarshal(raw, cfg); err != nil {
+		return nil, fmt.Errorf("yaml unmarshal: %w", err)
+	}
+
+	cfg.applyDefaults()
+
+	if err := cfg.validate(); err != nil {
+		return nil, fmt.Errorf("config invalid: %w", err)
+	}
+
+	return cfg, nil
+}
+
+func (c *Config) applyDefaults() {
+	// Server.
+	if c.Server.Listen == "" {
+		c.Server.Listen = "0.0.0.0:8443"
+	}
+
+	if c.Server.OpsListen == "" {
+		c.Server.OpsListen = "0.0.0.0:8442"
+	}
+	// Origin.
+	if c.Origin.Driver == "" {
+		c.Origin.Driver = "azureblob"
+	}
+
+	if c.Origin.TargetGlobal == 0 {
+		c.Origin.TargetGlobal = 192
+	}
+
+	if c.Origin.QueueTimeout == 0 {
+		c.Origin.QueueTimeout = 5 * time.Second
+	}
+
+	if c.Origin.Retry.Attempts == 0 {
+		c.Origin.Retry.Attempts = 3
+	}
+
+	if c.Origin.Retry.BackoffInitial == 0 {
+		c.Origin.Retry.BackoffInitial = 100 * time.Millisecond
+	}
+
+	if c.Origin.Retry.BackoffMax == 0 {
+		c.Origin.Retry.BackoffMax = 2 * time.Second
+	}
+
+	if c.Origin.Retry.MaxTotalDuration == 0 {
+		c.Origin.Retry.MaxTotalDuration = 5 * time.Second
+	}
+	// Cachestore.
+	if c.Cachestore.Driver == "" {
+		c.Cachestore.Driver = "s3"
+	}
+
+	if c.Cachestore.S3.Region == "" {
+		c.Cachestore.S3.Region = "us-east-1"
+	}
+	// Cluster.
+	if c.Cluster.MembershipRefresh == 0 {
+		c.Cluster.MembershipRefresh = 5 * time.Second
+	}
+
+	if c.Cluster.InternalListen == "" {
+		c.Cluster.InternalListen = "0.0.0.0:8444"
+	}
+
+	if c.Cluster.TargetReplicas == 0 {
+		c.Cluster.TargetReplicas = 3
+	}
+
+	if c.Cluster.InternalTLS.ServerName == "" {
+		c.Cluster.InternalTLS.ServerName = "orca.<ns>.svc"
+	}
+	// Resolve self pod IP from env if not set in YAML.
+	if c.Cluster.SelfPodIP == "" {
+		c.Cluster.SelfPodIP = os.Getenv("POD_IP")
+	}
+	// Resolve credentials from env if not set in YAML. This lets the
+	// non-secret config live in a ConfigMap while credentials come from
+	// a Kubernetes Secret mounted as env vars (envFrom: secretRef).
+	if c.Origin.Azureblob.AccountKey == "" {
+		c.Origin.Azureblob.AccountKey = os.Getenv("ORCA_AZUREBLOB_ACCOUNT_KEY")
+	}
+
+	if c.Origin.AWSS3.AccessKey == "" {
+		c.Origin.AWSS3.AccessKey = os.Getenv("ORCA_AWSS3_ACCESS_KEY")
+	}
+
+	if c.Origin.AWSS3.SecretKey == "" {
+		c.Origin.AWSS3.SecretKey = os.Getenv("ORCA_AWSS3_SECRET_KEY")
+	}
+
+	if c.Cachestore.S3.AccessKey == "" {
+		c.Cachestore.S3.AccessKey = os.Getenv("ORCA_CACHESTORE_S3_ACCESS_KEY")
+	}
+
+	if c.Cachestore.S3.SecretKey == "" {
+		c.Cachestore.S3.SecretKey = os.Getenv("ORCA_CACHESTORE_S3_SECRET_KEY")
+	}
+	// awss3 region default.
+	if c.Origin.AWSS3.Region == "" {
+		c.Origin.AWSS3.Region = "us-east-1"
+	}
+	// Chunk catalog.
+	if c.ChunkCatalog.MaxEntries == 0 {
+		c.ChunkCatalog.MaxEntries = 100_000
+	}
+	// Metadata.
+	if c.Metadata.TTL == 0 {
+		c.Metadata.TTL = 5 * time.Minute
+	}
+
+	if c.Metadata.NegativeTTL == 0 {
+		c.Metadata.NegativeTTL = 60 * time.Second
+	}
+
+	if c.Metadata.MaxEntries == 0 {
+		c.Metadata.MaxEntries = 10_000
+	}
+	// Chunking.
+	if c.Chunking.Size == 0 {
+		c.Chunking.Size = 8 * 1024 * 1024
+	}
+	// Tier ladder: default to a two-tier ramp that keeps small
+	// objects on the 8 MiB base size, bumps 1 GiB+ blobs to 64 MiB,
+	// and 10 GiB+ blobs to 128 MiB. Operators can replace or
+	// disable the ladder by setting tiers explicitly (including the
+	// empty list) in YAML.
+	if c.Chunking.Tiers == nil {
+		c.Chunking.Tiers = []ChunkTier{
+			{MinObjectSize: 1024 * 1024 * 1024, ChunkSize: 64 * 1024 * 1024},
+			{MinObjectSize: 10 * 1024 * 1024 * 1024, ChunkSize: 128 * 1024 * 1024},
+		}
+	}
+	// Readahead defaults to 8 chunks when the YAML field is omitted.
+	// An explicit "readahead: 0" disables prefetch.
+	if c.Chunking.Readahead == nil {
+		d := 8
+		c.Chunking.Readahead = &d
+	}
+	// Logging.
+	if c.Logging.Level == "" {
+		c.Logging.Level = "info"
+	}
+}
+
+func (c *Config) validate() error {
+	if c.Origin.ID == "" {
+		return fmt.Errorf("origin.id is required")
+	}
+
+	switch c.Origin.Driver {
+	case "azureblob":
+		if c.Origin.Azureblob.Account == "" {
+			return fmt.Errorf("origin.azureblob.account is required")
+		}
+
+		if c.Origin.Azureblob.Container == "" {
+			return fmt.Errorf("origin.azureblob.container is required")
+		}
+	case "awss3":
+		if c.Origin.AWSS3.Bucket == "" {
+			return fmt.Errorf("origin.awss3.bucket is required")
+		}
+	default:
+		return fmt.Errorf("origin.driver %q unsupported; supported: azureblob, awss3",
+			c.Origin.Driver)
+	}
+
+	if c.Cachestore.Driver != "s3" {
+		return fmt.Errorf("cachestore.driver %q unsupported; only s3 in v1", c.Cachestore.Driver)
+	}
+
+	if c.Cachestore.S3.Endpoint == "" {
+		return fmt.Errorf("cachestore.s3.endpoint is required")
+	}
+
+	if c.Cachestore.S3.Bucket == "" {
+		return fmt.Errorf("cachestore.s3.bucket is required")
+	}
+
+	if c.Cluster.Service == "" {
+		return fmt.Errorf("cluster.service is required (headless Service FQDN)")
+	}
+
+	if c.Cluster.SelfPodIP == "" {
+		return fmt.Errorf("cluster.self_pod_ip is required (typically resolved from POD_IP env)")
+	}
+
+	if c.Cluster.TargetReplicas < 1 {
+		return fmt.Errorf("cluster.target_replicas must be >= 1")
+	}
+
+	if c.Origin.TargetGlobal < c.Cluster.TargetReplicas {
+		return fmt.Errorf(
+			"origin.target_global=%d must be >= cluster.target_replicas=%d",
+			c.Origin.TargetGlobal, c.Cluster.TargetReplicas,
+		)
+	}
+
+	if c.Chunking.Size < 1024*1024 {
+		return fmt.Errorf("chunking.size %d too small; minimum 1 MiB", c.Chunking.Size)
+	}
+
+	if err := validateChunkingTiers(c.Chunking.Tiers); err != nil {
+		return err
+	}
+
+	if c.Chunking.Readahead != nil && *c.Chunking.Readahead < 0 {
+		return fmt.Errorf("chunking.readahead %d invalid; must be >= 0", *c.Chunking.Readahead)
+	}
+
+	if _, err := ParseLogLevel(c.Logging.Level); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// validateChunkingTiers enforces the unambiguous-tier invariants the
+// SizeFor selection rule depends on: every tier has positive bounds,
+// the ChunkSize floor matches Chunking.Size's 1 MiB minimum, and
+// MinObjectSize values are strictly ascending. Unsorted input is
+// rejected (rather than silently sorted) so operators see the typo
+// in their YAML rather than diagnosing a surprising chunk-size
+// selection in production.
+func validateChunkingTiers(tiers []ChunkTier) error {
+	for i, t := range tiers {
+		if t.MinObjectSize <= 0 {
+			return fmt.Errorf("chunking.tiers[%d].min_object_size %d invalid; must be > 0",
+				i, t.MinObjectSize)
+		}
+
+		if t.ChunkSize < 1024*1024 {
+			return fmt.Errorf("chunking.tiers[%d].chunk_size %d too small; minimum 1 MiB",
+				i, t.ChunkSize)
+		}
+
+		if i > 0 && t.MinObjectSize <= tiers[i-1].MinObjectSize {
+			return fmt.Errorf(
+				"chunking.tiers must be strictly ascending by min_object_size; "+
+					"tiers[%d].min_object_size=%d is not greater than tiers[%d].min_object_size=%d",
+				i, t.MinObjectSize, i-1, tiers[i-1].MinObjectSize)
+		}
+	}
+
+	return nil
+}
+
+// ParseLogLevel maps an orca log-level string to slog.Level. Returns
+// an error for unknown values. Empty string is treated as the
+// configured default ("info"). Used both by config.validate at YAML
+// parse time and by the cmd/orca entrypoint to honour the
+// ORCA_LOG_LEVEL environment override.
+func ParseLogLevel(s string) (slog.Level, error) {
+	switch strings.ToLower(strings.TrimSpace(s)) {
+	case "", "info":
+		return slog.LevelInfo, nil
+	case "debug":
+		return slog.LevelDebug, nil
+	case "warn", "warning":
+		return slog.LevelWarn, nil
+	case "error":
+		return slog.LevelError, nil
+	default:
+		return 0, fmt.Errorf("logging.level %q invalid; expected one of debug, info, warn, error", s)
+	}
+}
+
+// TargetPerReplica returns the per-replica origin concurrency cap
+// derived from origin.target_global divided by cluster.target_replicas.
+// This bounds the number of concurrent in-flight origin requests this
+// replica will issue.
+func (c *Config) TargetPerReplica() int {
+	if c.Cluster.TargetReplicas <= 0 {
+		return c.Origin.TargetGlobal
+	}
+
+	return c.Origin.TargetGlobal / c.Cluster.TargetReplicas
+}
diff --git a/internal/orca/config/config_test.go b/internal/orca/config/config_test.go
new file mode 100644
index 00000000..2d64a097
--- /dev/null
+++ b/internal/orca/config/config_test.go
@@ -0,0 +1,635 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package config
+
+import (
+	"log/slog"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+)
+
+// TestApplyDefaults_EnvFallback verifies that applyDefaults populates
+// credential / pod-identity fields from environment variables when
+// the YAML omits them. This is the path used in production where the
+// Kubernetes Secret is mounted via envFrom and the ConfigMap holds
+// only the non-secret config.
+//
+// Each subtest sets one env var and checks that:
+//   - env-set, yaml-empty -> field populated from env.
+//   - env-unset, yaml-set -> field keeps yaml value.
+//   - env-set, yaml-set   -> field keeps yaml value (yaml wins).
+//   - env-unset, yaml-empty -> field stays empty.
+func TestApplyDefaults_EnvFallback(t *testing.T) {
+	tests := []struct {
+		envVar string
+		setVal func(c *Config, v string)
+		getVal func(c *Config) string
+	}{
+		{
+			envVar: "POD_IP",
+			setVal: func(c *Config, v string) { c.Cluster.SelfPodIP = v },
+			getVal: func(c *Config) string { return c.Cluster.SelfPodIP },
+		},
+		{
+			envVar: "ORCA_AZUREBLOB_ACCOUNT_KEY",
+			setVal: func(c *Config, v string) { c.Origin.Azureblob.AccountKey = v },
+			getVal: func(c *Config) string { return c.Origin.Azureblob.AccountKey },
+		},
+		{
+			envVar: "ORCA_AWSS3_ACCESS_KEY",
+			setVal: func(c *Config, v string) { c.Origin.AWSS3.AccessKey = v },
+			getVal: func(c *Config) string { return c.Origin.AWSS3.AccessKey },
+		},
+		{
+			envVar: "ORCA_AWSS3_SECRET_KEY",
+			setVal: func(c *Config, v string) { c.Origin.AWSS3.SecretKey = v },
+			getVal: func(c *Config) string { return c.Origin.AWSS3.SecretKey },
+		},
+		{
+			envVar: "ORCA_CACHESTORE_S3_ACCESS_KEY",
+			setVal: func(c *Config, v string) { c.Cachestore.S3.AccessKey = v },
+			getVal: func(c *Config) string { return c.Cachestore.S3.AccessKey },
+		},
+		{
+			envVar: "ORCA_CACHESTORE_S3_SECRET_KEY",
+			setVal: func(c *Config, v string) { c.Cachestore.S3.SecretKey = v },
+			getVal: func(c *Config) string { return c.Cachestore.S3.SecretKey },
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.envVar, func(t *testing.T) {
+			t.Run("env_set/yaml_empty", func(t *testing.T) {
+				t.Setenv(tt.envVar, "from-env")
+
+				c := &Config{}
+				c.applyDefaults()
+
+				if got := tt.getVal(c); got != "from-env" {
+					t.Errorf("got %q want %q", got, "from-env")
+				}
+			})
+
+			t.Run("env_unset/yaml_set", func(t *testing.T) {
+				_ = os.Unsetenv(tt.envVar) //nolint:errcheck // best-effort
+
+				c := &Config{}
+				tt.setVal(c, "from-yaml")
+				c.applyDefaults()
+
+				if got := tt.getVal(c); got != "from-yaml" {
+					t.Errorf("got %q want %q", got, "from-yaml")
+				}
+			})
+
+			t.Run("env_set/yaml_set_yaml_wins", func(t *testing.T) {
+				t.Setenv(tt.envVar, "from-env")
+
+				c := &Config{}
+				tt.setVal(c, "from-yaml")
+				c.applyDefaults()
+
+				if got := tt.getVal(c); got != "from-yaml" {
+					t.Errorf("got %q want %q (yaml should win)", got, "from-yaml")
+				}
+			})
+
+			t.Run("env_unset/yaml_empty", func(t *testing.T) {
+				_ = os.Unsetenv(tt.envVar) //nolint:errcheck // best-effort
+
+				c := &Config{}
+				c.applyDefaults()
+
+				if got := tt.getVal(c); got != "" {
+					t.Errorf("got %q want empty", got)
+				}
+			})
+		})
+	}
+}
+
+// TestApplyDefaults_FieldDefaults verifies that the hard-coded
+// fallback values fire for every field whose zero value is replaced.
+func TestApplyDefaults_FieldDefaults(t *testing.T) {
+	t.Parallel()
+
+	c := &Config{}
+	c.applyDefaults()
+
+	checks := []struct {
+		name string
+		got  any
+		want any
+	}{
+		{"server.listen", c.Server.Listen, "0.0.0.0:8443"},
+		{"server.ops_listen", c.Server.OpsListen, "0.0.0.0:8442"},
+		{"origin.driver", c.Origin.Driver, "azureblob"},
+		{"origin.target_global", c.Origin.TargetGlobal, 192},
+		{"origin.queue_timeout", c.Origin.QueueTimeout, 5 * time.Second},
+		{"origin.retry.attempts", c.Origin.Retry.Attempts, 3},
+		{"origin.retry.backoff_initial", c.Origin.Retry.BackoffInitial, 100 * time.Millisecond},
+		{"origin.retry.backoff_max", c.Origin.Retry.BackoffMax, 2 * time.Second},
+		{"origin.retry.max_total_duration", c.Origin.Retry.MaxTotalDuration, 5 * time.Second},
+		{"cachestore.driver", c.Cachestore.Driver, "s3"},
+		{"cachestore.s3.region", c.Cachestore.S3.Region, "us-east-1"},
+		{"cluster.membership_refresh", c.Cluster.MembershipRefresh, 5 * time.Second},
+		{"cluster.internal_listen", c.Cluster.InternalListen, "0.0.0.0:8444"},
+		{"cluster.target_replicas", c.Cluster.TargetReplicas, 3},
+		{"cluster.internal_tls.server_name", c.Cluster.InternalTLS.ServerName, "orca.<ns>.svc"},
+		{"chunk_catalog.max_entries", c.ChunkCatalog.MaxEntries, 100_000},
+		{"metadata.ttl", c.Metadata.TTL, 5 * time.Minute},
+		{"metadata.negative_ttl", c.Metadata.NegativeTTL, 60 * time.Second},
+		{"metadata.max_entries", c.Metadata.MaxEntries, 10_000},
+		{"chunking.size", c.Chunking.Size, int64(8 * 1024 * 1024)},
+		{"origin.awss3.region", c.Origin.AWSS3.Region, "us-east-1"},
+		{"logging.level", c.Logging.Level, "info"},
+	}
+
+	for _, ch := range checks {
+		if ch.got != ch.want {
+			t.Errorf("%s: got %v want %v", ch.name, ch.got, ch.want)
+		}
+	}
+
+	// Tiers default to the documented 2-entry ladder. Compared
+	// separately since slice equality cannot use the table.
+	wantTiers := []ChunkTier{
+		{MinObjectSize: 1024 * 1024 * 1024, ChunkSize: 64 * 1024 * 1024},
+		{MinObjectSize: 10 * 1024 * 1024 * 1024, ChunkSize: 128 * 1024 * 1024},
+	}
+	if len(c.Chunking.Tiers) != len(wantTiers) {
+		t.Errorf("chunking.tiers length=%d want %d", len(c.Chunking.Tiers), len(wantTiers))
+	} else {
+		for i := range wantTiers {
+			if c.Chunking.Tiers[i] != wantTiers[i] {
+				t.Errorf("chunking.tiers[%d]=%+v want %+v",
+					i, c.Chunking.Tiers[i], wantTiers[i])
+			}
+		}
+	}
+	// Readahead defaults to a non-nil pointer to 8.
+	if c.Chunking.Readahead == nil {
+		t.Errorf("chunking.readahead is nil; expected default pointer")
+	} else if *c.Chunking.Readahead != 8 {
+		t.Errorf("chunking.readahead=%d want 8", *c.Chunking.Readahead)
+	}
+}
+
+// TestApplyDefaults_PreservesExplicitValues verifies that explicit
+// non-zero values are not overwritten by applyDefaults.
+func TestApplyDefaults_PreservesExplicitValues(t *testing.T) {
+	t.Parallel()
+
+	c := &Config{
+		Server: Server{Listen: "1.2.3.4:9000"},
+		Origin: Origin{
+			Driver:       "awss3",
+			TargetGlobal: 64,
+		},
+		Cachestore:   Cachestore{S3: CachestoreS3{Region: "eu-west-1"}},
+		Cluster:      Cluster{TargetReplicas: 7, MembershipRefresh: 10 * time.Second},
+		ChunkCatalog: ChunkCatalog{MaxEntries: 50},
+		Metadata:     Metadata{TTL: time.Hour, MaxEntries: 99},
+		Chunking:     Chunking{Size: 16 << 20},
+	}
+
+	c.applyDefaults()
+
+	if c.Server.Listen != "1.2.3.4:9000" {
+		t.Errorf("Server.Listen overwritten: %q", c.Server.Listen)
+	}
+
+	if c.Origin.Driver != "awss3" {
+		t.Errorf("Origin.Driver overwritten: %q", c.Origin.Driver)
+	}
+
+	if c.Origin.TargetGlobal != 64 {
+		t.Errorf("Origin.TargetGlobal overwritten: %d", c.Origin.TargetGlobal)
+	}
+
+	if c.Cachestore.S3.Region != "eu-west-1" {
+		t.Errorf("Cachestore.S3.Region overwritten: %q", c.Cachestore.S3.Region)
+	}
+
+	if c.Cluster.TargetReplicas != 7 {
+		t.Errorf("Cluster.TargetReplicas overwritten: %d", c.Cluster.TargetReplicas)
+	}
+
+	if c.Cluster.MembershipRefresh != 10*time.Second {
+		t.Errorf("Cluster.MembershipRefresh overwritten: %v", c.Cluster.MembershipRefresh)
+	}
+
+	if c.ChunkCatalog.MaxEntries != 50 {
+		t.Errorf("ChunkCatalog.MaxEntries overwritten: %d", c.ChunkCatalog.MaxEntries)
+	}
+
+	if c.Metadata.TTL != time.Hour {
+		t.Errorf("Metadata.TTL overwritten: %v", c.Metadata.TTL)
+	}
+
+	if c.Chunking.Size != 16<<20 {
+		t.Errorf("Chunking.Size overwritten: %d", c.Chunking.Size)
+	}
+}
+
+// TestLoad_Validate covers the validate() error paths.
+func TestLoad_Validate(t *testing.T) {
+	// No t.Parallel: subtests use t.Setenv to neutralize POD_IP.
+	tests := []struct {
+		name    string
+		yaml    string
+		wantErr string
+		wantOK  bool
+	}{
+		{
+			name:   "valid awss3 config",
+			yaml:   validAwss3YAML,
+			wantOK: true,
+		},
+		{
+			name:    "missing origin.id",
+			yaml:    strings.ReplaceAll(validAwss3YAML, "id: test-origin", "id: \"\""),
+			wantErr: "origin.id is required",
+		},
+		{
+			name:    "unsupported driver",
+			yaml:    strings.ReplaceAll(validAwss3YAML, "driver: awss3", "driver: ftp"),
+			wantErr: "origin.driver",
+		},
+		{
+			name:    "missing awss3 bucket",
+			yaml:    strings.ReplaceAll(validAwss3YAML, "bucket: orca-origin", "bucket: \"\""),
+			wantErr: "origin.awss3.bucket is required",
+		},
+		{
+			name:    "missing cachestore endpoint",
+			yaml:    strings.ReplaceAll(validAwss3YAML, "endpoint: http://localstack:4566", "endpoint: \"\""),
+			wantErr: "cachestore.s3.endpoint is required",
+		},
+		{
+			name:    "missing cluster service",
+			yaml:    strings.ReplaceAll(validAwss3YAML, "service: orca-peers.svc", "service: \"\""),
+			wantErr: "cluster.service is required",
+		},
+		{
+			name:    "missing self_pod_ip when POD_IP unset",
+			yaml:    strings.ReplaceAll(validAwss3YAML, "self_pod_ip: 10.0.0.1", "self_pod_ip: \"\""),
+			wantErr: "self_pod_ip is required",
+		},
+		{
+			name:    "target_replicas negative",
+			yaml:    strings.ReplaceAll(validAwss3YAML, "target_replicas: 3", "target_replicas: -1"),
+			wantErr: "target_replicas",
+		},
+		{
+			name:    "chunking size below minimum",
+			yaml:    strings.ReplaceAll(validAwss3YAML, "size: 8388608", "size: 4096"),
+			wantErr: "chunking.size",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Ensure no leakage of POD_IP from the test process env.
+			t.Setenv("POD_IP", "")
+
+			path := writeTempYAML(t, tt.yaml)
+
+			_, err := Load(path)
+			if tt.wantOK {
+				if err != nil {
+					t.Fatalf("expected nil error, got %v", err)
+				}
+
+				return
+			}
+
+			if err == nil {
+				t.Fatalf("expected error containing %q, got nil", tt.wantErr)
+			}
+
+			if !strings.Contains(err.Error(), tt.wantErr) {
+				t.Errorf("error %q does not contain %q", err.Error(), tt.wantErr)
+			}
+		})
+	}
+}
+
+// TestValidateChunkingTiers_OK covers tier ladders that should pass
+// validation: empty (feature off), single tier, multi-tier strictly
+// ascending.
+func TestValidateChunkingTiers_OK(t *testing.T) {
+	t.Parallel()
+
+	cases := [][]ChunkTier{
+		nil,
+		{},
+		{{MinObjectSize: 1 << 30, ChunkSize: 64 << 20}},
+		{
+			{MinObjectSize: 1 << 30, ChunkSize: 64 << 20},
+			{MinObjectSize: 10 << 30, ChunkSize: 128 << 20},
+		},
+	}
+
+	for i, tiers := range cases {
+		if err := validateChunkingTiers(tiers); err != nil {
+			t.Errorf("case[%d] unexpected error: %v", i, err)
+		}
+	}
+}
+
+// TestValidateChunkingTiers_Errors covers the rejection paths: tiny
+// chunk size, zero / negative min object size, unsorted thresholds,
+// and duplicate thresholds (caught by the strict-ascending rule).
+func TestValidateChunkingTiers_Errors(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name    string
+		tiers   []ChunkTier
+		wantErr string
+	}{
+		{
+			name: "chunk size below 1 MiB",
+			tiers: []ChunkTier{
+				{MinObjectSize: 1 << 30, ChunkSize: 1024},
+			},
+			wantErr: "chunk_size",
+		},
+		{
+			name: "zero min object size",
+			tiers: []ChunkTier{
+				{MinObjectSize: 0, ChunkSize: 64 << 20},
+			},
+			wantErr: "min_object_size",
+		},
+		{
+			name: "negative min object size",
+			tiers: []ChunkTier{
+				{MinObjectSize: -1, ChunkSize: 64 << 20},
+			},
+			wantErr: "min_object_size",
+		},
+		{
+			name: "unsorted ascending rejected",
+			tiers: []ChunkTier{
+				{MinObjectSize: 10 << 30, ChunkSize: 64 << 20},
+				{MinObjectSize: 1 << 30, ChunkSize: 128 << 20},
+			},
+			wantErr: "strictly ascending",
+		},
+		{
+			name: "duplicate min object size rejected",
+			tiers: []ChunkTier{
+				{MinObjectSize: 1 << 30, ChunkSize: 64 << 20},
+				{MinObjectSize: 1 << 30, ChunkSize: 128 << 20},
+			},
+			wantErr: "strictly ascending",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			err := validateChunkingTiers(tt.tiers)
+			if err == nil {
+				t.Fatalf("expected error containing %q, got nil", tt.wantErr)
+			}
+
+			if !strings.Contains(err.Error(), tt.wantErr) {
+				t.Errorf("error %q does not contain %q", err.Error(), tt.wantErr)
+			}
+		})
+	}
+}
+
+// TestLoad_TiersAndReadahead drives validation through Load (full
+// YAML path) to ensure the tier rejection surfaces with the rich
+// error message and that an explicit readahead: 0 disables prefetch
+// (i.e. survives applyDefaults and is not bumped back to 8).
+func TestLoad_TiersAndReadahead(t *testing.T) {
+	t.Parallel()
+
+	t.Run("explicit_readahead_zero_preserved", func(t *testing.T) {
+		yaml := validAwss3YAML + "  readahead: 0\n"
+		path := writeTempYAML(t, yaml)
+
+		cfg, err := Load(path)
+		if err != nil {
+			t.Fatalf("Load: %v", err)
+		}
+
+		if cfg.Chunking.Readahead == nil {
+			t.Fatalf("Readahead should be non-nil after applyDefaults")
+		}
+
+		if *cfg.Chunking.Readahead != 0 {
+			t.Errorf("Readahead=%d want 0 (explicit disable preserved)", *cfg.Chunking.Readahead)
+		}
+
+		if d := cfg.Chunking.ReadaheadDepth(); d != 0 {
+			t.Errorf("ReadaheadDepth()=%d want 0", d)
+		}
+	})
+
+	t.Run("explicit_empty_tiers_preserved", func(t *testing.T) {
+		yaml := validAwss3YAML + "  tiers: []\n"
+		path := writeTempYAML(t, yaml)
+
+		cfg, err := Load(path)
+		if err != nil {
+			t.Fatalf("Load: %v", err)
+		}
+		// Tiers explicitly set to [] should survive applyDefaults
+		// (the default ladder must not overwrite operator intent).
+		if len(cfg.Chunking.Tiers) != 0 {
+			t.Errorf("Tiers=%v want []; applyDefaults overwrote explicit empty",
+				cfg.Chunking.Tiers)
+		}
+
+		if cfg.Chunking.AsChunkTiers() != nil {
+			t.Errorf("AsChunkTiers() returned non-nil for empty tiers")
+		}
+	})
+
+	t.Run("unsorted_tiers_rejected", func(t *testing.T) {
+		yaml := validAwss3YAML + `  tiers:
+    - min_object_size: 10737418240
+      chunk_size: 67108864
+    - min_object_size: 1073741824
+      chunk_size: 134217728
+`
+		path := writeTempYAML(t, yaml)
+
+		_, err := Load(path)
+		if err == nil {
+			t.Fatalf("Load accepted unsorted tiers")
+		}
+
+		if !strings.Contains(err.Error(), "strictly ascending") {
+			t.Errorf("error %q does not mention strict ascending order", err.Error())
+		}
+	})
+
+	t.Run("negative_readahead_rejected", func(t *testing.T) {
+		yaml := validAwss3YAML + "  readahead: -1\n"
+		path := writeTempYAML(t, yaml)
+
+		_, err := Load(path)
+		if err == nil {
+			t.Fatalf("Load accepted negative readahead")
+		}
+
+		if !strings.Contains(err.Error(), "chunking.readahead") {
+			t.Errorf("error %q does not mention chunking.readahead", err.Error())
+		}
+	})
+}
+
+// TestChunking_AsChunkTiers covers the config -> chunk.Tier mapping
+// preserves order and field values, and returns nil for empty.
+func TestChunking_AsChunkTiers(t *testing.T) {
+	t.Parallel()
+
+	c := Chunking{
+		Size: 8 << 20,
+		Tiers: []ChunkTier{
+			{MinObjectSize: 1 << 30, ChunkSize: 64 << 20},
+			{MinObjectSize: 10 << 30, ChunkSize: 128 << 20},
+		},
+	}
+
+	got := c.AsChunkTiers()
+	if len(got) != 2 {
+		t.Fatalf("len=%d want 2", len(got))
+	}
+
+	if got[0].MinObjectSize != 1<<30 || got[0].ChunkSize != 64<<20 {
+		t.Errorf("got[0]=%+v", got[0])
+	}
+
+	if got[1].MinObjectSize != 10<<30 || got[1].ChunkSize != 128<<20 {
+		t.Errorf("got[1]=%+v", got[1])
+	}
+
+	if (Chunking{}).AsChunkTiers() != nil {
+		t.Errorf("empty Chunking.AsChunkTiers() should be nil")
+	}
+}
+
+// TestParseLogLevel covers the orca log-level string -> slog.Level
+// mapping. Both empty and "info" map to LevelInfo so the YAML default
+// path matches the explicit-info path; "warn" and "warning" are
+// accepted equivalently. Unknown values return a descriptive error
+// so misconfiguration is surfaced rather than silently downgrading.
+func TestParseLogLevel(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		in      string
+		want    slog.Level
+		wantErr bool
+	}{
+		{"", slog.LevelInfo, false},
+		{"info", slog.LevelInfo, false},
+		{"INFO", slog.LevelInfo, false},
+		{"debug", slog.LevelDebug, false},
+		{" Debug ", slog.LevelDebug, false},
+		{"warn", slog.LevelWarn, false},
+		{"warning", slog.LevelWarn, false},
+		{"error", slog.LevelError, false},
+		{"trace", 0, true},
+		{"verbose", 0, true},
+		{"5", 0, true},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.in, func(t *testing.T) {
+			got, err := ParseLogLevel(tt.in)
+			if tt.wantErr {
+				if err == nil {
+					t.Errorf("ParseLogLevel(%q) = %v, want error", tt.in, got)
+				}
+
+				return
+			}
+
+			if err != nil {
+				t.Errorf("ParseLogLevel(%q) unexpected err: %v", tt.in, err)
+				return
+			}
+
+			if got != tt.want {
+				t.Errorf("ParseLogLevel(%q) = %v, want %v", tt.in, got, tt.want)
+			}
+		})
+	}
+}
+
+// TestValidate_RejectsInvalidLogLevel verifies that an unrecognised
+// logging.level value is caught at config.Load time rather than at
+// process startup.
+func TestValidate_RejectsInvalidLogLevel(t *testing.T) {
+	t.Parallel()
+
+	yaml := validAwss3YAML + `
+logging:
+  level: trace
+`
+	path := writeTempYAML(t, yaml)
+
+	_, err := Load(path)
+	if err == nil {
+		t.Fatalf("Load accepted invalid logging.level: trace")
+	}
+
+	if !strings.Contains(err.Error(), "logging.level") {
+		t.Errorf("error does not mention logging.level: %v", err)
+	}
+}
+
+func writeTempYAML(t *testing.T, content string) string {
+	t.Helper()
+
+	dir := t.TempDir()
+	path := filepath.Join(dir, "config.yaml")
+
+	if err := os.WriteFile(path, []byte(content), 0o600); err != nil {
+		t.Fatalf("write temp yaml: %v", err)
+	}
+
+	return path
+}
+
+const validAwss3YAML = `
+server:
+  listen: 0.0.0.0:8443
+origin:
+  id: test-origin
+  driver: awss3
+  awss3:
+    endpoint: http://localstack:4566
+    region: us-east-1
+    bucket: orca-origin
+    access_key: test
+    secret_key: test
+    use_path_style: true
+cachestore:
+  driver: s3
+  s3:
+    endpoint: http://localstack:4566
+    bucket: orca-cache
+    region: us-east-1
+    access_key: test
+    secret_key: test
+    use_path_style: true
+cluster:
+  service: orca-peers.svc
+  self_pod_ip: 10.0.0.1
+  target_replicas: 3
+chunking:
+  size: 8388608
+`
diff --git a/internal/orca/fetch/fetch.go b/internal/orca/fetch/fetch.go
new file mode 100644
index 00000000..5b1865cc
--- /dev/null
+++ b/internal/orca/fetch/fetch.go
@@ -0,0 +1,563 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Package fetch is the per-replica fill orchestrator: per-ChunkKey
+// singleflight, pre-header origin retry, per-replica origin
+// concurrency cap, and cross-replica fill via the cluster's internal
+// RPC.
+//
+// The dedup model is per-replica singleflight + cluster-wide dedup
+// via a rendezvous-hashed coordinator. No disk spool; joiners stream
+// from the leader's in-memory ring buffer.
+//
+// Pre-header retry: the coordinator may retry origin GETs up to the
+// budget in cfg.Origin.Retry until the first byte is committed to
+// the client response. Once headers are sent retries are not safe and
+// failures become mid-stream aborts.
+package fetch
+
+import (
+	"bytes"
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"log/slog"
+	"sync"
+	"time"
+
+	"github.com/Azure/unbounded/internal/orca/cachestore"
+	"github.com/Azure/unbounded/internal/orca/chunk"
+	"github.com/Azure/unbounded/internal/orca/chunkcatalog"
+	"github.com/Azure/unbounded/internal/orca/cluster"
+	"github.com/Azure/unbounded/internal/orca/config"
+	"github.com/Azure/unbounded/internal/orca/metadata"
+	"github.com/Azure/unbounded/internal/orca/origin"
+)
+
+// Coordinator orchestrates per-replica chunk fills.
+type Coordinator struct {
+	or  origin.Origin
+	cs  cachestore.CacheStore
+	cl  *cluster.Cluster
+	cat *chunkcatalog.Catalog
+	mc  *metadata.Cache
+	cfg *config.Config
+	log *slog.Logger
+
+	// Per-replica origin concurrency cap. Bounds in-flight
+	// Origin.GetRange calls to floor(target_global / target_replicas).
+	originSem chan struct{}
+
+	// Per-ChunkKey singleflight. Concurrent local fills for the same
+	// chunk collapse to one origin GetRange.
+	mu       sync.Mutex
+	inflight map[string]*fill
+}
+
+type fill struct {
+	done    chan struct{}
+	bodyBuf *bytes.Buffer // buffered chunk after fetch (in-memory, bounded by chunk size)
+	err     error
+}
+
+// NewCoordinator wires up the fetch coordinator. The log is used for
+// peer-fallback warnings and commit-after-serve failure traces, plus
+// debug-level tracing through every chunk-resolution decision point
+// when the operator enables logging.level: debug. The caller (usually
+// app.Start) injects the app-wide slog.Logger so fetch-path logs are
+// unified with the rest of the runtime's output. Passing nil falls
+// back to slog.Default().
+func NewCoordinator(
+	or origin.Origin,
+	cs cachestore.CacheStore,
+	cl *cluster.Cluster,
+	cat *chunkcatalog.Catalog,
+	mc *metadata.Cache,
+	cfg *config.Config,
+	log *slog.Logger,
+) *Coordinator {
+	tpr := cfg.TargetPerReplica()
+	if tpr < 1 {
+		tpr = 1
+	}
+
+	if log == nil {
+		log = slog.Default()
+	}
+
+	return &Coordinator{
+		or:        or,
+		cs:        cs,
+		cl:        cl,
+		cat:       cat,
+		mc:        mc,
+		cfg:       cfg,
+		log:       log,
+		originSem: make(chan struct{}, tpr),
+		inflight:  make(map[string]*fill),
+	}
+}
+
+// Origin returns the underlying origin (used by the LIST passthrough).
+func (c *Coordinator) Origin() origin.Origin { return c.or }
+
+// HeadObject returns object metadata, satisfying client HEAD requests.
+//
+// Rejects responses with an empty ETag via origin.MissingETagError.
+// chunk.Path encodes the ETag in its hash input; a stable cache key
+// requires the origin to supply one. Without an ETag, two different
+// versions of the same (bucket, key) would alias to the same
+// chunk.Path and serve stale bytes silently. The negative result is
+// cached at NegativeTTL so we do not re-Head a misconfigured origin
+// on every request.
+func (c *Coordinator) HeadObject(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) {
+	c.log.LogAttrs(ctx, slog.LevelDebug, "head_object",
+		slog.String("origin_id", c.cfg.Origin.ID),
+		slog.String("bucket", bucket),
+		slog.String("key", key),
+	)
+
+	return c.mc.LookupOrFetch(ctx, c.cfg.Origin.ID, bucket, key,
+		func(ctx context.Context) (origin.ObjectInfo, error) {
+			info, err := c.or.Head(ctx, bucket, key)
+			if err != nil {
+				return info, err
+			}
+
+			if info.ETag == "" {
+				return info, &origin.MissingETagError{Bucket: bucket, Key: key}
+			}
+
+			return info, nil
+		})
+}
+
+// GetChunk returns a reader over the chunk's bytes, fulfilling either
+// from CacheStore (hit) or by orchestrating a cluster-wide
+// dedup'd fill (miss).
+//
+// objectSize is the authoritative size of the object the chunk
+// belongs to (from origin Head). It is used to clamp the cachestore
+// read length and to size the tail chunk correctly on a miss.
+//
+// On miss:
+//   - If self is the coordinator: run local fill (origin GET via retry,
+//     atomic commit to CacheStore, populate buffer for joiners).
+//   - If a peer is the coordinator: send /internal/fill to that peer;
+//     stream from peer's response. On 409 Conflict, fall back to local
+//     fill.
+func (c *Coordinator) GetChunk(ctx context.Context, k chunk.Key, objectSize int64) (io.ReadCloser, error) {
+	c.log.LogAttrs(ctx, slog.LevelDebug, "get_chunk",
+		chunkAttrs(k),
+		slog.Int64("object_size", objectSize),
+		slog.Int64("expected_len", k.ExpectedLen(objectSize)),
+	)
+
+	if rc, hit, err := c.lookupOrStat(ctx, k, objectSize); err != nil {
+		return nil, err
+	} else if hit {
+		return rc, nil
+	}
+
+	// Cluster-wide dedup: route to coordinator.
+	coord := c.cl.Coordinator(k)
+
+	c.log.LogAttrs(ctx, slog.LevelDebug, "coordinator_selected",
+		chunkAttrs(k),
+		slog.String("coord_ip", coord.IP),
+		slog.Bool("is_self", coord.Self),
+	)
+
+	if !coord.Self {
+		c.log.LogAttrs(ctx, slog.LevelDebug, "peer_fill_attempt",
+			chunkAttrs(k),
+			slog.String("peer_ip", coord.IP),
+		)
+
+		rc, err := c.cl.FillFromPeer(ctx, coord, k, objectSize)
+		if err == nil {
+			c.log.LogAttrs(ctx, slog.LevelDebug, "peer_fill_success",
+				chunkAttrs(k),
+				slog.String("peer_ip", coord.IP),
+			)
+
+			return rc, nil
+		}
+
+		if errors.Is(err, cluster.ErrPeerNotCoordinator) {
+			c.log.LogAttrs(ctx, slog.LevelWarn, "peer reported not-coordinator; falling back to local fill",
+				chunkAttrs(k),
+				slog.String("peer_ip", coord.IP),
+			)
+			// fall through to local fill
+		} else {
+			c.log.LogAttrs(ctx, slog.LevelWarn, "internal-fill RPC failed; falling back to local fill",
+				chunkAttrs(k),
+				slog.String("peer_ip", coord.IP),
+				slog.Any("err", err),
+			)
+		}
+	}
+
+	return c.fillLocal(ctx, k, objectSize)
+}
+
+// FillForPeer is the path taken by the /internal/fill handler.
+//
+// The receiver becomes the leader for this fill (or joins an in-flight
+// fill for the same key). Returns a streaming body of the entire chunk.
+func (c *Coordinator) FillForPeer(ctx context.Context, k chunk.Key, objectSize int64) (io.ReadCloser, error) {
+	c.log.LogAttrs(ctx, slog.LevelDebug, "fill_for_peer",
+		chunkAttrs(k),
+		slog.Int64("object_size", objectSize),
+	)
+
+	if rc, hit, err := c.lookupOrStat(ctx, k, objectSize); err != nil {
+		return nil, err
+	} else if hit {
+		return rc, nil
+	}
+
+	return c.fillLocal(ctx, k, objectSize)
+}
+
+// lookupOrStat is the shared catalog-hit / cachestore-stat probe used
+// by both GetChunk and FillForPeer. Returns (body, true, nil) when a
+// pre-existing chunk is found, (nil, false, nil) on a clean miss
+// (caller should run the appropriate fill path), or (nil, false, err)
+// for non-recoverable cachestore errors.
+//
+// On a catalog hit that turns out to be stale (cachestore returns
+// ErrNotFound), the catalog entry is forgotten so the next call
+// re-stats fresh.
+func (c *Coordinator) lookupOrStat(ctx context.Context, k chunk.Key, objectSize int64) (io.ReadCloser, bool, error) {
+	expected := k.ExpectedLen(objectSize)
+
+	if c.cat.Lookup(k) {
+		c.log.LogAttrs(ctx, slog.LevelDebug, "catalog_hit",
+			chunkAttrs(k),
+		)
+
+		rc, err := c.cs.GetChunk(ctx, k, 0, expected)
+		if err == nil {
+			return rc, true, nil
+		}
+
+		if errors.Is(err, cachestore.ErrNotFound) {
+			c.log.LogAttrs(ctx, slog.LevelDebug, "catalog_stale_forgotten",
+				chunkAttrs(k),
+			)
+			c.cat.Forget(k)
+			// fall through to stat
+		} else {
+			return nil, false, err
+		}
+	}
+
+	info, err := c.cs.Stat(ctx, k)
+	if err != nil {
+		if errors.Is(err, cachestore.ErrNotFound) {
+			c.log.LogAttrs(ctx, slog.LevelDebug, "cachestore_stat_miss",
+				chunkAttrs(k),
+			)
+
+			return nil, false, nil
+		}
+
+		return nil, false, err
+	}
+
+	c.log.LogAttrs(ctx, slog.LevelDebug, "cachestore_stat_hit",
+		chunkAttrs(k),
+		slog.Int64("size", info.Size),
+	)
+
+	c.cat.Record(k)
+
+	// Trust the stat's reported size if it disagrees with our
+	// expectation (e.g. older committed entry from before a chunk
+	// size change), but clamp to the expected length so a corrupt
+	// larger stat does not leak bytes past the object end.
+	readLen := info.Size
+	if expected > 0 && readLen > expected {
+		readLen = expected
+	}
+
+	rc, err := c.cs.GetChunk(ctx, k, 0, readLen)
+	if err != nil {
+		return nil, false, err
+	}
+
+	return rc, true, nil
+}
+
+// fillLocal runs (or joins) the singleflight for k on this replica.
+func (c *Coordinator) fillLocal(ctx context.Context, k chunk.Key, objectSize int64) (io.ReadCloser, error) {
+	path := k.Path()
+
+	c.mu.Lock()
+
+	f, ok := c.inflight[path]
+	if !ok {
+		f = &fill{done: make(chan struct{})}
+		c.inflight[path] = f
+		c.mu.Unlock()
+
+		c.log.LogAttrs(ctx, slog.LevelDebug, "fill_local_lead",
+			chunkAttrs(k),
+		)
+
+		go c.runFill(k, objectSize, f)
+	} else {
+		c.mu.Unlock()
+		c.log.LogAttrs(ctx, slog.LevelDebug, "fill_local_join",
+			chunkAttrs(k),
+		)
+	}
+
+	select {
+	case <-ctx.Done():
+		return nil, ctx.Err()
+	case <-f.done:
+	}
+
+	if f.err != nil {
+		return nil, f.err
+	}
+
+	return io.NopCloser(bytes.NewReader(f.bodyBuf.Bytes())), nil
+}
+
+func (c *Coordinator) runFill(k chunk.Key, objectSize int64, f *fill) {
+	// runFill runs on a fill-scoped detached context (not the
+	// caller's) so it can complete the cachestore commit step even
+	// if the originating client disconnects mid-stream. The 5-minute
+	// ceiling bounds the cost: a fill no joiner ever reads still
+	// releases its origin-semaphore slot and clears its inflight
+	// entry within the budget. Peak per-fill heap is one ChunkSize
+	// bytes.Buffer (8 MiB default).
+	//
+	// Commit-after-serve ordering: once the origin body is fully
+	// fetched and validated, joiners are released (close(f.done))
+	// BEFORE the PutChunk RPC begins. This shaves joiner latency by
+	// the cachestore commit time on the cold-fill path: joiners get
+	// bytes as soon as origin delivered them, and the commit runs in
+	// parallel from the joiners' perspective. Correctness is
+	// preserved because the buffer is fully populated and
+	// length-validated before release; PutChunk reads buf.Bytes()
+	// concurrently with joiner reads, but bytes.Buffer is never
+	// mutated after the final io.Copy returns, so the underlying
+	// byte slice is effectively immutable and safe for concurrent
+	// reads.
+	//
+	// release() is sync.Once-wrapped so close(f.done) fires exactly
+	// once whether via the explicit success-path call or the deferred
+	// safety net (which catches panic paths).
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+	defer cancel()
+
+	var releaseOnce sync.Once
+
+	release := func() {
+		releaseOnce.Do(func() { close(f.done) })
+	}
+
+	defer func() {
+		release()
+		c.mu.Lock()
+		delete(c.inflight, k.Path())
+		c.mu.Unlock()
+	}()
+
+	// Acquire per-replica origin slot.
+	queueCtx, queueCancel := context.WithTimeout(ctx, c.cfg.Origin.QueueTimeout)
+	defer queueCancel()
+
+	select {
+	case c.originSem <- struct{}{}:
+	case <-queueCtx.Done():
+		f.err = fmt.Errorf("origin: queue timeout (cap=%d)", cap(c.originSem))
+		return
+	}
+
+	defer func() { <-c.originSem }()
+
+	c.log.LogAttrs(ctx, slog.LevelDebug, "origin_slot_acquired",
+		chunkAttrs(k),
+		slog.Int("slot_cap", cap(c.originSem)),
+	)
+
+	// expectedLen is the authoritative number of bytes we should
+	// receive from origin: ChunkSize for non-tail chunks, the
+	// remainder for the tail. Production callers always supply a
+	// known objectSize, so expectedLen > 0; the wire format
+	// (DecodeChunkKey) and edge handler both reject the
+	// objectSize == 0 case at their boundaries, so the validation
+	// below is always exercised.
+	expectedLen := k.ExpectedLen(objectSize)
+	off := k.Index * k.ChunkSize
+
+	body, err := c.fetchWithRetry(ctx, k, off, expectedLen)
+	if err != nil {
+		f.err = err
+		return
+	}
+	defer body.Close() //nolint:errcheck // origin body close best-effort
+
+	buf := &bytes.Buffer{}
+	if _, err := io.Copy(buf, body); err != nil {
+		f.err = fmt.Errorf("fill copy: %w", err)
+		return
+	}
+
+	c.log.LogAttrs(ctx, slog.LevelDebug, "origin_body_received",
+		chunkAttrs(k),
+		slog.Int("bytes", buf.Len()),
+		slog.Int64("expected_len", expectedLen),
+	)
+
+	if int64(buf.Len()) != expectedLen {
+		f.err = fmt.Errorf("origin returned %d bytes, expected %d (chunk=%s)",
+			buf.Len(), expectedLen, k.String())
+
+		return
+	}
+
+	f.bodyBuf = buf
+
+	// Release joiners BEFORE the PutChunk commit. Joiners' reads of
+	// f.bodyBuf.Bytes() are safe to overlap with the PutChunk RPC's
+	// read of the same slice: bytes.Buffer's internal slice is no
+	// longer mutated after io.Copy returned above.
+	release()
+
+	// Atomic commit to CacheStore (asynchronous from joiners'
+	// perspective; they have their bytes already).
+	commitErr := c.cs.PutChunk(ctx, k, int64(buf.Len()), bytes.NewReader(buf.Bytes()))
+
+	switch {
+	case commitErr == nil:
+		c.cat.Record(k)
+		c.log.LogAttrs(ctx, slog.LevelDebug, "commit_success",
+			chunkAttrs(k),
+			slog.Int("bytes", buf.Len()),
+		)
+	case errors.Is(commitErr, cachestore.ErrCommitLost):
+		// Another replica won; treat existing CacheStore entry as truth.
+		c.log.LogAttrs(ctx, slog.LevelDebug, "commit_lost",
+			chunkAttrs(k),
+		)
+
+		if _, err := c.cs.Stat(ctx, k); err == nil {
+			c.cat.Record(k)
+		} else {
+			// Stat failed after a lost commit: cachestore is likely
+			// unhealthy (transient or otherwise). Catalog stays
+			// unrecorded (next request refills), but log so operators
+			// can see cachestore flapping.
+			c.log.LogAttrs(ctx, slog.LevelDebug, "commit_lost_stat_failed",
+				chunkAttrs(k),
+				slog.Any("err", err),
+			)
+		}
+	default:
+		c.log.LogAttrs(ctx, slog.LevelWarn, "commit-after-serve failed",
+			chunkAttrs(k),
+			slog.Any("err", commitErr),
+		)
+		// Don't record in catalog; next request refills.
+	}
+}
+
+func (c *Coordinator) fetchWithRetry(ctx context.Context, k chunk.Key, off, length int64) (io.ReadCloser, error) {
+	deadline := time.Now().Add(c.cfg.Origin.Retry.MaxTotalDuration)
+	backoff := c.cfg.Origin.Retry.BackoffInitial
+
+	var lastErr error
+
+	for attempt := 1; attempt <= c.cfg.Origin.Retry.Attempts; attempt++ {
+		if err := ctx.Err(); err != nil {
+			return nil, err
+		}
+
+		if time.Now().After(deadline) {
+			return nil, fmt.Errorf("origin retry exhausted (duration); last err: %w", lastErr)
+		}
+
+		c.log.LogAttrs(ctx, slog.LevelDebug, "origin_get_range_attempt",
+			chunkAttrs(k),
+			slog.Int("attempt", attempt),
+			slog.Int64("off", off),
+			slog.Int64("length", length),
+		)
+
+		body, err := c.or.GetRange(ctx, k.Bucket, k.ObjectKey, k.ETag, off, length)
+		if err == nil {
+			c.log.LogAttrs(ctx, slog.LevelDebug, "origin_get_range_ok",
+				chunkAttrs(k),
+				slog.Int("attempt", attempt),
+			)
+
+			return body, nil
+		}
+
+		lastErr = err
+		// Non-retryable: ETag changed.
+		var etagChanged *origin.OriginETagChangedError
+		if errors.As(err, &etagChanged) {
+			c.log.LogAttrs(ctx, slog.LevelDebug, "origin_etag_changed",
+				chunkAttrs(k),
+				slog.Int("attempt", attempt),
+			)
+			c.mc.Invalidate(c.cfg.Origin.ID, k.Bucket, k.ObjectKey)
+
+			return nil, err
+		}
+		// Non-retryable: not found.
+		if errors.Is(err, origin.ErrNotFound) {
+			c.log.LogAttrs(ctx, slog.LevelDebug, "origin_not_found",
+				chunkAttrs(k),
+				slog.Int("attempt", attempt),
+			)
+
+			return nil, err
+		}
+
+		c.log.LogAttrs(ctx, slog.LevelDebug, "origin_retryable_error",
+			chunkAttrs(k),
+			slog.Int("attempt", attempt),
+			slog.Any("err", err),
+			slog.Duration("next_backoff", backoff),
+		)
+		// Backoff.
+		if attempt < c.cfg.Origin.Retry.Attempts {
+			select {
+			case <-ctx.Done():
+				return nil, ctx.Err()
+			case <-time.After(backoff):
+			}
+
+			backoff *= 2
+			if backoff > c.cfg.Origin.Retry.BackoffMax {
+				backoff = c.cfg.Origin.Retry.BackoffMax
+			}
+		}
+	}
+
+	return nil, fmt.Errorf("origin retry exhausted (attempts); last err: %w", lastErr)
+}
+
+// chunkAttrs returns a slog.Attr group identifying the chunk by its
+// (origin, bucket, key, index) tuple. Used at every fetch-path log
+// callsite for consistent grep / filter syntax across emissions.
+// ETag is intentionally not surfaced here - log it via slog.String
+// where needed using the chunk.Key's truncated String() form.
+func chunkAttrs(k chunk.Key) slog.Attr {
+	return slog.Group("chunk",
+		slog.String("origin_id", k.OriginID),
+		slog.String("bucket", k.Bucket),
+		slog.String("key", k.ObjectKey),
+		slog.Int64("index", k.Index),
+	)
+}
diff --git a/internal/orca/fetch/fetch_test.go b/internal/orca/fetch/fetch_test.go
new file mode 100644
index 00000000..617d5eab
--- /dev/null
+++ b/internal/orca/fetch/fetch_test.go
@@ -0,0 +1,450 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package fetch
+
+import (
+	"bytes"
+	"context"
+	"errors"
+	"io"
+	"log/slog"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/Azure/unbounded/internal/orca/cachestore"
+	"github.com/Azure/unbounded/internal/orca/chunk"
+	"github.com/Azure/unbounded/internal/orca/chunkcatalog"
+	"github.com/Azure/unbounded/internal/orca/config"
+	"github.com/Azure/unbounded/internal/orca/metadata"
+	"github.com/Azure/unbounded/internal/orca/origin"
+)
+
+// TestNewCoordinator_UsesInjectedLogger verifies the constructor
+// stores the provided slog.Logger on the Coordinator. The peer-RPC
+// fallback warnings and commit-after-serve failure traces emitted
+// from the fetch path must flow through this logger rather than
+// slog.Default(), so operators can route fetch logs alongside the
+// rest of the app's structured output.
+func TestNewCoordinator_UsesInjectedLogger(t *testing.T) {
+	t.Parallel()
+
+	injected := slog.New(slog.NewTextHandler(io.Discard, nil))
+	c := NewCoordinator(nil, nil, nil, nil, nil, &config.Config{}, injected)
+
+	if c.log != injected {
+		t.Errorf("Coordinator.log not the injected logger")
+	}
+}
+
+// TestNewCoordinator_NilLoggerFallsBackToDefault locks the contract
+// that a nil logger falls back to slog.Default() rather than panicking
+// during peer fallback or commit-after-serve.
+func TestNewCoordinator_NilLoggerFallsBackToDefault(t *testing.T) {
+	t.Parallel()
+
+	c := NewCoordinator(nil, nil, nil, nil, nil, &config.Config{}, nil)
+	if c.log == nil {
+		t.Errorf("nil logger should have fallen back to slog.Default()")
+	}
+}
+
+// TestChunkAttrs_GroupShape locks the slog attribute taxonomy used
+// by every fetch-path emission. The 'chunk' group must contain the
+// (origin_id, bucket, key, index) identifying tuple so operator
+// queries can grep on a single, consistent attribute path.
+func TestChunkAttrs_GroupShape(t *testing.T) {
+	t.Parallel()
+
+	var buf bytes.Buffer
+
+	log := slog.New(slog.NewTextHandler(&buf,
+		&slog.HandlerOptions{Level: slog.LevelDebug}))
+
+	log.LogAttrs(context.Background(), slog.LevelDebug, "probe", chunkAttrs(chunk.Key{
+		OriginID:  "origin-x",
+		Bucket:    "bkt",
+		ObjectKey: "obj",
+		ChunkSize: 1024,
+		Index:     7,
+	}))
+
+	out := buf.String()
+	for _, want := range []string{
+		"chunk.origin_id=origin-x",
+		"chunk.bucket=bkt",
+		"chunk.key=obj",
+		"chunk.index=7",
+	} {
+		if !strings.Contains(out, want) {
+			t.Errorf("chunkAttrs output missing %q; got %q", want, out)
+		}
+	}
+}
+
+// TestCoordinator_DebugEmissionsAtDebugLevel exercises a sample of
+// the fetch-path debug emissions and asserts they reach the
+// handler. We cannot drive the full GetChunk path here without
+// standing up the entire dependency graph, so we exercise the
+// representative log statements directly. The contract under test
+// is that the call sites use LogAttrs at Debug level (so zero-cost
+// at Info+) and emit the standardized 'chunk' attribute group.
+func TestCoordinator_DebugEmissionsAtDebugLevel(t *testing.T) {
+	t.Parallel()
+
+	var buf bytes.Buffer
+
+	log := slog.New(slog.NewTextHandler(&buf,
+		&slog.HandlerOptions{Level: slog.LevelDebug}))
+	c := &Coordinator{log: log}
+
+	k := chunk.Key{
+		OriginID:  "ox",
+		Bucket:    "bkt",
+		ObjectKey: "obj",
+		ChunkSize: 1024,
+		Index:     3,
+	}
+	// Sample emissions corresponding to lookupOrStat hits,
+	// peer-fill route selection, and commit success.
+	c.log.LogAttrs(context.Background(), slog.LevelDebug, "catalog_hit", chunkAttrs(k))
+	c.log.LogAttrs(context.Background(), slog.LevelDebug, "peer_fill_attempt",
+		chunkAttrs(k), slog.String("peer_ip", "10.0.0.5"))
+	c.log.LogAttrs(context.Background(), slog.LevelDebug, "commit_success",
+		chunkAttrs(k), slog.Int("bytes", 1024))
+
+	out := buf.String()
+	for _, want := range []string{"catalog_hit", "peer_fill_attempt", "commit_success", "chunk.index=3"} {
+		if !strings.Contains(out, want) {
+			t.Errorf("expected %q in debug output; got %q", want, out)
+		}
+	}
+}
+
+// TestCoordinator_DebugFilteredAtInfo verifies that the standard
+// LogAttrs path emits nothing when the handler is configured above
+// Debug. This is the operational expectation: enabling Info-level
+// logging silences the per-chunk traces entirely so production
+// throughput is not affected by log overhead.
+func TestCoordinator_DebugFilteredAtInfo(t *testing.T) {
+	t.Parallel()
+
+	var buf bytes.Buffer
+
+	log := slog.New(slog.NewTextHandler(&buf,
+		&slog.HandlerOptions{Level: slog.LevelInfo}))
+	c := &Coordinator{log: log}
+
+	k := chunk.Key{OriginID: "ox", Bucket: "b", ObjectKey: "o", ChunkSize: 1024, Index: 0}
+	c.log.LogAttrs(context.Background(), slog.LevelDebug, "catalog_hit", chunkAttrs(k))
+
+	if buf.Len() != 0 {
+		t.Errorf("debug emission leaked through Info-level handler: %q", buf.String())
+	}
+}
+
+// TestCoordinator_WarnRoutesThroughInjectedHandler verifies that the
+// (migrated to LogAttrs) commit-after-serve warning still surfaces
+// at Warn level on the injected logger. Regression test for the
+// existing call site that pre-dates the debug emissions.
+func TestCoordinator_WarnRoutesThroughInjectedHandler(t *testing.T) {
+	t.Parallel()
+
+	var buf bytes.Buffer
+
+	log := slog.New(slog.NewTextHandler(&buf, &slog.HandlerOptions{Level: slog.LevelWarn}))
+	c := &Coordinator{log: log}
+
+	k := chunk.Key{OriginID: "ox", Bucket: "b", ObjectKey: "o", ChunkSize: 1024, Index: 0}
+	c.log.LogAttrs(context.Background(), slog.LevelWarn, "commit-after-serve failed",
+		chunkAttrs(k),
+		slog.String("err", "stub put failure"),
+	)
+
+	out := buf.String()
+	if !strings.Contains(out, "commit-after-serve failed") {
+		t.Errorf("warning not captured; got %q", out)
+	}
+
+	if !strings.Contains(out, "chunk.key=o") {
+		t.Errorf("chunk attribute missing; got %q", out)
+	}
+}
+
+// fakeOriginForFill returns a fixed body for any GetRange call.
+type fakeOriginForFill struct {
+	body []byte
+}
+
+func (f *fakeOriginForFill) Head(_ context.Context, _, _ string) (origin.ObjectInfo, error) {
+	return origin.ObjectInfo{Size: int64(len(f.body)), ETag: "e1"}, nil
+}
+
+func (f *fakeOriginForFill) GetRange(_ context.Context, _, _, _ string, _, _ int64) (io.ReadCloser, error) {
+	return io.NopCloser(bytes.NewReader(f.body)), nil
+}
+
+func (f *fakeOriginForFill) List(_ context.Context, _, _, _ string, _ int) (origin.ListResult, error) {
+	return origin.ListResult{}, nil
+}
+
+// slowPutCacheStore implements cachestore.CacheStore. PutChunk
+// blocks until putGate is closed; signals putStarted when entered
+// and putReturned when leaving. Used by the commit-after-serve test
+// to observe the relative ordering of joiner release vs PutChunk
+// completion.
+type slowPutCacheStore struct {
+	putGate      chan struct{}
+	putStarted   chan struct{}
+	putReturned  chan struct{}
+	closeOnce    sync.Once
+	putCallCount atomic.Int64
+}
+
+func newSlowPutCacheStore() *slowPutCacheStore {
+	return &slowPutCacheStore{
+		putGate:     make(chan struct{}),
+		putStarted:  make(chan struct{}),
+		putReturned: make(chan struct{}),
+	}
+}
+
+func (s *slowPutCacheStore) GetChunk(_ context.Context, _ chunk.Key, _, _ int64) (io.ReadCloser, error) {
+	return nil, cachestore.ErrNotFound
+}
+
+func (s *slowPutCacheStore) PutChunk(_ context.Context, _ chunk.Key, _ int64, _ io.Reader) error {
+	s.putCallCount.Add(1)
+	s.closeOnce.Do(func() { close(s.putStarted) })
+	<-s.putGate
+	close(s.putReturned)
+
+	return nil
+}
+
+func (s *slowPutCacheStore) Stat(_ context.Context, _ chunk.Key) (cachestore.Info, error) {
+	return cachestore.Info{}, cachestore.ErrNotFound
+}
+
+func (s *slowPutCacheStore) Delete(_ context.Context, _ chunk.Key) error  { return nil }
+func (s *slowPutCacheStore) SelfTestAtomicCommit(_ context.Context) error { return nil }
+
+// TestRunFill_CommitAfterServe_JoinerSeesBytesBeforeCommit verifies
+// that runFill releases joiners (close(f.done)) BEFORE the cachestore
+// PutChunk completes. With the prior commit-before-serve ordering,
+// joiners had to wait an extra commit-rtt; this test detects a
+// regression by asserting the joiner returns while PutChunk is still
+// blocked.
+//
+// Regression for H-1.
+func TestRunFill_CommitAfterServe_JoinerSeesBytesBeforeCommit(t *testing.T) {
+	t.Parallel()
+
+	payload := []byte("hello world commit-after-serve test payload!!")
+	chunkSize := int64(len(payload))
+
+	or := &fakeOriginForFill{body: payload}
+	cs := newSlowPutCacheStore()
+	cat := chunkcatalog.New(64, slog.New(slog.NewTextHandler(io.Discard, nil)))
+	mc := metadata.NewCache(config.Metadata{TTL: time.Minute, NegativeTTL: time.Minute, MaxEntries: 16}, nil)
+
+	cfg := &config.Config{
+		Origin: config.Origin{
+			ID:           "ox",
+			QueueTimeout: time.Second,
+			Retry: config.OriginRetry{
+				Attempts:         1,
+				BackoffInitial:   time.Millisecond,
+				BackoffMax:       time.Millisecond,
+				MaxTotalDuration: time.Second,
+			},
+			TargetGlobal: 4,
+		},
+		Cluster: config.Cluster{TargetReplicas: 1},
+	}
+
+	co := NewCoordinator(or, cs, nil, cat, mc, cfg, slog.New(slog.NewTextHandler(io.Discard, nil)))
+
+	k := chunk.Key{
+		OriginID:  "ox",
+		Bucket:    "b",
+		ObjectKey: "o",
+		ETag:      "e1",
+		ChunkSize: chunkSize,
+		Index:     0,
+	}
+
+	rcCh := make(chan io.ReadCloser, 1)
+	errCh := make(chan error, 1)
+
+	go func() {
+		rc, err := co.fillLocal(context.Background(), k, chunkSize)
+		if err != nil {
+			errCh <- err
+			return
+		}
+
+		rcCh <- rc
+	}()
+	// Wait for PutChunk to have been entered, ensuring runFill is
+	// past the validate-and-release point.
+	select {
+	case <-cs.putStarted:
+	case <-time.After(2 * time.Second):
+		close(cs.putGate)
+		t.Fatalf("PutChunk never entered; runFill never reached commit")
+	}
+
+	// fillLocal should return now (joiner released before PutChunk
+	// completes). With the old commit-before-serve ordering it would
+	// still be blocked.
+	select {
+	case rc := <-rcCh:
+		// Verify PutChunk hasn't completed.
+		select {
+		case <-cs.putReturned:
+			t.Errorf("PutChunk returned before fillLocal; commit-after-serve regressed")
+		default:
+		}
+
+		got, err := io.ReadAll(rc)
+		if err != nil {
+			t.Errorf("read body: %v", err)
+		}
+
+		if !bytes.Equal(got, payload) {
+			t.Errorf("body mismatch: got %d bytes want %d", len(got), len(payload))
+		}
+
+		_ = rc.Close() //nolint:errcheck // test cleanup
+	case err := <-errCh:
+		close(cs.putGate)
+		t.Fatalf("fillLocal err: %v", err)
+	case <-time.After(2 * time.Second):
+		close(cs.putGate)
+		t.Fatalf("fillLocal didn't return while PutChunk was blocked; commit-after-serve regressed")
+	}
+
+	// Release PutChunk and let runFill finish.
+	close(cs.putGate)
+	<-cs.putReturned
+}
+
+// TestRunFill_ReleaseIdempotent_PanicSafe verifies that close(f.done)
+// fires exactly once whether via the explicit success-path call or
+// the deferred safety net. A panic mid-fill must not corrupt the
+// channel state by double-closing it.
+//
+// Regression for H-1's sync.Once safety property.
+func TestRunFill_ReleaseIdempotent_PanicSafe(t *testing.T) {
+	t.Parallel()
+
+	// Use the test pattern directly: a sync.Once-wrapped close,
+	// called from two paths.
+	done := make(chan struct{})
+
+	var once sync.Once
+
+	release := func() { once.Do(func() { close(done) }) }
+
+	release() // explicit path
+	release() // simulated "deferred safety net" path - must not panic
+
+	select {
+	case <-done:
+		// Closed - good.
+	default:
+		t.Errorf("done channel not closed after release()")
+	}
+}
+
+// stubOriginEmptyETag returns ObjectInfo with no ETag - simulating a
+// misconfigured origin (e.g. some S3-compatible backend without
+// versioning, or a custom origin not following the AWS/Azure
+// contract).
+type stubOriginEmptyETag struct{}
+
+func (stubOriginEmptyETag) Head(_ context.Context, _, _ string) (origin.ObjectInfo, error) {
+	return origin.ObjectInfo{Size: 1024, ETag: ""}, nil
+}
+
+func (stubOriginEmptyETag) GetRange(_ context.Context, _, _, _ string, _, _ int64) (io.ReadCloser, error) {
+	return nil, nil
+}
+
+func (stubOriginEmptyETag) List(_ context.Context, _, _, _ string, _ int) (origin.ListResult, error) {
+	return origin.ListResult{}, nil
+}
+
+// TestHeadObject_RejectsEmptyETag verifies that the coordinator
+// rejects an origin Head response with an empty ETag. chunk.Path
+// encodes the ETag in its hash; without it, two different versions
+// of the same (bucket, key) would alias and serve stale bytes
+// silently.
+//
+// Regression for H-7.
+func TestHeadObject_RejectsEmptyETag(t *testing.T) {
+	t.Parallel()
+
+	mc := metadata.NewCache(config.Metadata{TTL: time.Minute, NegativeTTL: time.Minute, MaxEntries: 16}, nil)
+	co := NewCoordinator(stubOriginEmptyETag{}, nil, nil, nil, mc,
+		&config.Config{Origin: config.Origin{ID: "ox"}, Cluster: config.Cluster{TargetReplicas: 1}},
+		slog.New(slog.NewTextHandler(io.Discard, nil)))
+
+	_, err := co.HeadObject(context.Background(), "b", "o")
+	if err == nil {
+		t.Fatalf("HeadObject accepted empty ETag; want MissingETagError")
+	}
+
+	var mte *origin.MissingETagError
+	if !errors.As(err, &mte) {
+		t.Errorf("err type = %T (want *origin.MissingETagError): %v", err, err)
+	}
+}
+
+// TestHeadObject_EmptyETag_CachedNegatively verifies that a second
+// HeadObject call after a MissingETagError result does NOT re-hit
+// the origin: the negative result must be cached so we do not
+// hammer a misconfigured origin on every request.
+func TestHeadObject_EmptyETag_CachedNegatively(t *testing.T) {
+	t.Parallel()
+
+	or := &countingOrigin{inner: stubOriginEmptyETag{}}
+	mc := metadata.NewCache(config.Metadata{TTL: time.Minute, NegativeTTL: time.Minute, MaxEntries: 16}, nil)
+	co := NewCoordinator(or, nil, nil, nil, mc,
+		&config.Config{Origin: config.Origin{ID: "ox"}, Cluster: config.Cluster{TargetReplicas: 1}},
+		slog.New(slog.NewTextHandler(io.Discard, nil)))
+
+	for i := 0; i < 3; i++ {
+		_, err := co.HeadObject(context.Background(), "b", "o")
+		if err == nil {
+			t.Errorf("call %d: HeadObject accepted empty ETag", i)
+		}
+	}
+
+	if got := or.headCalls.Load(); got != 1 {
+		t.Errorf("origin.Head invoked %d times; want 1 (negative cached)", got)
+	}
+}
+
+// countingOrigin wraps an origin.Origin and counts Head invocations.
+type countingOrigin struct {
+	inner     origin.Origin
+	headCalls atomic.Int64
+}
+
+func (c *countingOrigin) Head(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) {
+	c.headCalls.Add(1)
+	return c.inner.Head(ctx, bucket, key)
+}
+
+func (c *countingOrigin) GetRange(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error) {
+	return c.inner.GetRange(ctx, bucket, key, etag, off, n)
+}
+
+func (c *countingOrigin) List(ctx context.Context, bucket, prefix, marker string, max int) (origin.ListResult, error) {
+	return c.inner.List(ctx, bucket, prefix, marker, max)
+}
diff --git a/internal/orca/inttest/azure_test.go b/internal/orca/inttest/azure_test.go
new file mode 100644
index 00000000..5c9ab1dd
--- /dev/null
+++ b/internal/orca/inttest/azure_test.go
@@ -0,0 +1,45 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//go:build integrationtest
+
+package inttest
+
+import (
+	"bytes"
+	"context"
+	"net/http"
+	"testing"
+	"time"
+)
+
+// TestAzureBlobOrigin_ColdGet verifies the azureblob origin driver
+// works against Azurite end-to-end on a 3-replica cluster. The
+// MediumBlob spans 2 chunks so rendezvous-hashed routing typically
+// exercises both fillLocal and FillFromPeer in a single run.
+func TestAzureBlobOrigin_ColdGet(t *testing.T) {
+	t.Parallel()
+
+	ctx, cancel := context.WithTimeout(t.Context(), 90*time.Second)
+	defer cancel()
+
+	ctr := pkgAzurite.NewContainer(ctx, t, "orca-origin")
+	blob := MediumBlob()
+	SeedAzure(ctx, t, pkgAzurite, ctr, []SeedBlob{blob})
+
+	cl := StartCluster(ctx, t, ClusterOptions{
+		LocalStack:     pkgLocalStack,
+		Azurite:        pkgAzurite,
+		OriginDriver:   "azureblob",
+		AzureContainer: ctr,
+	})
+
+	resp := cl.Get(1).HTTP.Get(ctx, t, ctr, blob.Key)
+	if resp.Status != http.StatusOK {
+		t.Fatalf("status=%d body=%s", resp.Status, string(resp.Body))
+	}
+
+	if !bytes.Equal(resp.Body, blob.Data) {
+		t.Fatalf("body mismatch: got %d bytes want %d", len(resp.Body), len(blob.Data))
+	}
+}
diff --git a/internal/orca/inttest/azurite.go b/internal/orca/inttest/azurite.go
new file mode 100644
index 00000000..451f81ec
--- /dev/null
+++ b/internal/orca/inttest/azurite.go
@@ -0,0 +1,169 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//go:build integrationtest
+
+package inttest
+
+import (
+	"context"
+	"crypto/rand"
+	"encoding/hex"
+	"fmt"
+	"testing"
+
+	"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob"
+	"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blob"
+	"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/container"
+	"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/pageblob"
+	"github.com/testcontainers/testcontainers-go"
+	"github.com/testcontainers/testcontainers-go/wait"
+)
+
+// Azurite is a running Azurite container with helper accessors for
+// constructing azblob clients pointed at the well-known dev account.
+type Azurite struct {
+	container testcontainers.Container
+	endpoint  string // http://host:port/devstoreaccount1
+}
+
+// Endpoint returns the Azurite blob-service URL including the
+// devstoreaccount1 path segment.
+func (az *Azurite) Endpoint() string { return az.endpoint }
+
+// AccountName returns the well-known Azurite dev account name.
+func (az *Azurite) AccountName() string { return azuriteAccountName }
+
+// AccountKey returns the well-known Azurite dev account key.
+func (az *Azurite) AccountKey() string { return azuriteAccountKey }
+
+// StartAzurite launches an Azurite container and returns once the
+// blob-service port is reachable. Caller terminates via Terminate or
+// t.Cleanup.
+func StartAzurite(ctx context.Context) (*Azurite, error) {
+	req := testcontainers.ContainerRequest{
+		Image:        azuriteImage,
+		ExposedPorts: []string{azuritePort + "/tcp"},
+		// `azurite-blob` listens on 0.0.0.0 by default; --skipApiVersionCheck
+		// keeps the SDK happy for newer client versions.
+		Cmd:        []string{"azurite-blob", "--blobHost", "0.0.0.0", "--skipApiVersionCheck"},
+		WaitingFor: wait.ForListeningPort(azuritePort + "/tcp"),
+	}
+
+	c, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{
+		ContainerRequest: req,
+		Started:          true,
+	})
+	if err != nil {
+		return nil, fmt.Errorf("start azurite: %w", err)
+	}
+
+	host, err := c.Host(ctx)
+	if err != nil {
+		_ = c.Terminate(ctx) //nolint:errcheck // best-effort cleanup
+		return nil, fmt.Errorf("azurite host: %w", err)
+	}
+
+	port, err := c.MappedPort(ctx, azuritePort+"/tcp")
+	if err != nil {
+		_ = c.Terminate(ctx) //nolint:errcheck // best-effort cleanup
+		return nil, fmt.Errorf("azurite port: %w", err)
+	}
+
+	endpoint := fmt.Sprintf("http://%s:%s/%s", host, port.Port(), azuriteAccountName)
+
+	return &Azurite{
+		container: c,
+		endpoint:  endpoint,
+	}, nil
+}
+
+// Terminate stops and removes the Azurite container.
+func (az *Azurite) Terminate(ctx context.Context) error {
+	return az.container.Terminate(ctx)
+}
+
+// NewServiceClient returns an azblob.Client authenticated with the
+// well-known Azurite dev creds.
+func (az *Azurite) NewServiceClient(t *testing.T) *azblob.Client {
+	t.Helper()
+
+	cred, err := azblob.NewSharedKeyCredential(az.AccountName(), az.AccountKey())
+	if err != nil {
+		t.Fatalf("azurite shared key cred: %v", err)
+	}
+
+	cli, err := azblob.NewClientWithSharedKeyCredential(az.endpoint, cred, nil)
+	if err != nil {
+		t.Fatalf("azurite client: %v", err)
+	}
+
+	return cli
+}
+
+// NewContainer creates a fresh container and registers a cleanup. The
+// container name is returned.
+func (az *Azurite) NewContainer(ctx context.Context, t *testing.T, prefix string) string {
+	t.Helper()
+
+	cli := az.NewServiceClient(t)
+	name := uniqueName(prefix)
+
+	if _, err := cli.CreateContainer(ctx, name, nil); err != nil {
+		t.Fatalf("create container %s: %v", name, err)
+	}
+
+	t.Cleanup(func() {
+		_, _ = cli.DeleteContainer(context.Background(), name, nil) //nolint:errcheck // best-effort cleanup
+	})
+
+	return name
+}
+
+// UploadBlockBlob uploads bytes as a block blob to (container, name).
+func (az *Azurite) UploadBlockBlob(ctx context.Context, t *testing.T, ctr, name string, data []byte) {
+	t.Helper()
+
+	cli := az.NewServiceClient(t)
+	if _, err := cli.UploadBuffer(ctx, ctr, name, data, nil); err != nil {
+		t.Fatalf("upload block blob %s/%s: %v", ctr, name, err)
+	}
+}
+
+// UploadPageBlob uploads bytes as a page blob (used to exercise the
+// unsupported-blob-type rejection path in the azureblob driver). Size
+// must be a multiple of 512.
+func (az *Azurite) UploadPageBlob(ctx context.Context, t *testing.T, ctr, name string, size int64) {
+	t.Helper()
+
+	cred, err := azblob.NewSharedKeyCredential(az.AccountName(), az.AccountKey())
+	if err != nil {
+		t.Fatalf("azurite shared key cred: %v", err)
+	}
+
+	containerCli, err := container.NewClientWithSharedKeyCredential(
+		fmt.Sprintf("%s/%s", az.endpoint, ctr), cred, nil)
+	if err != nil {
+		t.Fatalf("container client: %v", err)
+	}
+
+	pbCli := containerCli.NewPageBlobClient(name)
+	if _, err := pbCli.Create(ctx, size, &pageblob.CreateOptions{
+		HTTPHeaders: &blob.HTTPHeaders{},
+	}); err != nil {
+		t.Fatalf("create page blob: %v", err)
+	}
+	// Page blobs created here are zero-filled; tests don't read content
+	// because the azureblob driver rejects non-Block-Blob types before
+	// the GET stage.
+}
+
+// uniqueName returns a short random-suffixed name suitable for
+// LocalStack buckets and Azurite containers.
+func uniqueName(prefix string) string {
+	var b [4]byte
+
+	_, _ = rand.Read(b[:]) //nolint:errcheck // crypto/rand never fails on linux
+
+	return fmt.Sprintf("%s-%s", prefix, hex.EncodeToString(b[:]))
+}
diff --git a/internal/orca/inttest/client.go b/internal/orca/inttest/client.go
new file mode 100644
index 00000000..78543451
--- /dev/null
+++ b/internal/orca/inttest/client.go
@@ -0,0 +1,127 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//go:build integrationtest
+
+package inttest
+
+import (
+	"context"
+	"encoding/xml"
+	"fmt"
+	"io"
+	"net/http"
+	"testing"
+)
+
+// Client is a thin HTTP wrapper that targets a single replica's edge
+// listener and provides typed helpers (GET, GET-Range, HEAD, LIST) for
+// test assertions.
+type Client struct {
+	BaseURL string
+	HTTP    *http.Client
+}
+
+// NewClient returns a Client targeting baseURL (e.g. http://127.0.0.1:34567).
+func NewClient(baseURL string) *Client {
+	return &Client{
+		BaseURL: baseURL,
+		HTTP:    &http.Client{},
+	}
+}
+
+// GetResponse is the result of a GET / HEAD request.
+type GetResponse struct {
+	Status int
+	Header http.Header
+	Body   []byte
+}
+
+// Get fetches the full body of /bucket/key.
+func (c *Client) Get(ctx context.Context, t *testing.T, bucket, key string) GetResponse {
+	t.Helper()
+
+	return c.do(ctx, t, http.MethodGet, fmt.Sprintf("/%s/%s", bucket, key), nil)
+}
+
+// GetRange fetches a byte range from /bucket/key.
+func (c *Client) GetRange(ctx context.Context, t *testing.T, bucket, key string, start, end int64) GetResponse {
+	t.Helper()
+
+	hdr := http.Header{}
+	hdr.Set("Range", fmt.Sprintf("bytes=%d-%d", start, end))
+
+	return c.do(ctx, t, http.MethodGet, fmt.Sprintf("/%s/%s", bucket, key), hdr)
+}
+
+// Head issues a HEAD against /bucket/key.
+func (c *Client) Head(ctx context.Context, t *testing.T, bucket, key string) GetResponse {
+	t.Helper()
+
+	return c.do(ctx, t, http.MethodHead, fmt.Sprintf("/%s/%s", bucket, key), nil)
+}
+
+// ListBucketResult mirrors the (subset) S3 ListObjectsV2 XML response
+// shape produced by the orca edge handler.
+type ListBucketResult struct {
+	XMLName  xml.Name `xml:"ListBucketResult"`
+	Name     string   `xml:"Name"`
+	Prefix   string   `xml:"Prefix"`
+	KeyCount int      `xml:"KeyCount"`
+	Contents []struct {
+		Key  string `xml:"Key"`
+		Size int64  `xml:"Size"`
+		ETag string `xml:"ETag"`
+	} `xml:"Contents"`
+}
+
+// List issues a LIST against /bucket/?list-type=2&prefix=<prefix>.
+func (c *Client) List(ctx context.Context, t *testing.T, bucket, prefix string) ListBucketResult {
+	t.Helper()
+
+	resp := c.do(ctx, t, http.MethodGet,
+		fmt.Sprintf("/%s/?list-type=2&prefix=%s", bucket, prefix), nil)
+	if resp.Status != http.StatusOK {
+		t.Fatalf("LIST status=%d body=%s", resp.Status, string(resp.Body))
+	}
+
+	var out ListBucketResult
+	if err := xml.Unmarshal(resp.Body, &out); err != nil {
+		t.Fatalf("LIST decode: %v body=%s", err, string(resp.Body))
+	}
+
+	return out
+}
+
+func (c *Client) do(ctx context.Context, t *testing.T, method, path string, hdr http.Header) GetResponse {
+	t.Helper()
+
+	req, err := http.NewRequestWithContext(ctx, method, c.BaseURL+path, nil)
+	if err != nil {
+		t.Fatalf("build request: %v", err)
+	}
+
+	for k, vs := range hdr {
+		for _, v := range vs {
+			req.Header.Add(k, v)
+		}
+	}
+
+	resp, err := c.HTTP.Do(req)
+	if err != nil {
+		t.Fatalf("%s %s: %v", method, path, err)
+	}
+
+	defer func() { _ = resp.Body.Close() }() //nolint:errcheck // body close best-effort in tests
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		t.Fatalf("read body: %v", err)
+	}
+
+	return GetResponse{
+		Status: resp.StatusCode,
+		Header: resp.Header,
+		Body:   body,
+	}
+}
diff --git a/internal/orca/inttest/doc.go b/internal/orca/inttest/doc.go
new file mode 100644
index 00000000..ac83f611
--- /dev/null
+++ b/internal/orca/inttest/doc.go
@@ -0,0 +1,75 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//go:build integrationtest
+
+// Package inttest contains integration tests for the Orca cache.
+//
+// Build tag `integrationtest` gates these tests; run via:
+//
+//	make orca-inttest
+//
+// Equivalent to:
+//
+//	go test -tags=integrationtest -race -timeout 15m \
+//	  ./internal/orca/inttest/...
+//
+// # Architecture
+//
+// The harness brings up real LocalStack and Azurite containers via
+// testcontainers-go and constructs N in-process *app.App instances
+// wired to those containers. By default StartCluster runs 3 replicas,
+// matching the production deploy/orca topology.
+//
+// Every replica binds to 127.0.0.1 with an OS-assigned distinct
+// internal port; the cluster.Peer struct now carries an explicit Port
+// (zero in production, set in tests) and FillFromPeer dials peer.IP +
+// peer.Port. This lets multi-replica tests run on every platform
+// (Linux, macOS, Windows / WSL) without loopback-alias setup.
+//
+// Each replica owns its own StaticPeerSource (cluster.PeerSource).
+// Tests that need to induce membership disagreement mutate one
+// replica's source; the cluster's refresh goroutine picks up the
+// change within MembershipRefresh (250 ms in tests).
+//
+// # Container lifecycle
+//
+// TestMain starts one LocalStack and one Azurite container per
+// `go test` invocation; per-test buckets/containers prevent
+// cross-test interference.
+//
+// # File layout
+//
+//   - e2e_test.go - the canonical end-to-end suite (3 replicas).
+//     Boot-self-test, cold/warm GET, ranged GET, multi-chunk GET,
+//     LIST, HEAD, NotFound, rendezvous coordinator routing,
+//     singleflight collapse, peer-not-coordinator fallback (real).
+//   - azure_test.go - azureblob origin driver smoke against Azurite
+//     (3 replicas).
+//
+// Driver-level branch coverage (versioning gate, blob-type
+// rejection) lives as fast unit tests in the respective driver
+// packages (cachestore/s3, origin/azureblob), not here.
+//
+// # Adding a scenario
+//
+//  1. Pick the right entry point: StartCluster (3-replica default).
+//     Tests that need to assert on a boot-time failure mode that
+//     surfaces before any chunk fetch (versioning gate, blob-type
+//     rejection, etc.) should live as unit tests in the respective
+//     driver package.
+//  2. Seed the origin: SeedS3 or SeedAzure.
+//  3. Issue requests via cl.Get(i).HTTP.Get / GetRange / Head / List.
+//  4. Assert byte-exact body, status code, and (where relevant) origin
+//     RPC counts via the optional CountingOrigin or peer 409 counts via
+//     CountingInternalHandlerWrap.
+//
+// # TODO (genuinely future work)
+//
+//   - TestEtagChange (mid-fill mutation): requires a deterministic
+//     test seam in fetch.Coordinator (e.g. a hook that pauses between
+//     chunk fetches) so the test can rewrite the origin object
+//     between chunk 0 and chunk 1 of the same fill.
+//   - Fault-injection origin / cachestore decorators: useful for
+//     timeout, throttle, and 5xx retry-budget assertions.
+package inttest
diff --git a/internal/orca/inttest/e2e_test.go b/internal/orca/inttest/e2e_test.go
new file mode 100644
index 00000000..c384fc61
--- /dev/null
+++ b/internal/orca/inttest/e2e_test.go
@@ -0,0 +1,496 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//go:build integrationtest
+
+package inttest
+
+import (
+	"bytes"
+	"context"
+	"net/http"
+	"strconv"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/Azure/unbounded/internal/orca/chunk"
+	"github.com/Azure/unbounded/internal/orca/cluster"
+)
+
+// e2e_test.go is the canonical end-to-end suite for orca: every
+// scenario runs against a 3-replica in-process cluster pointed at
+// LocalStack. Tests that exercise chunk fetching naturally exercise
+// both the local-fill path (when self happens to win rendezvous for
+// a chunk) and the cross-replica /internal/fill path (when a peer
+// wins).
+//
+// Driver-level branch coverage (versioning gate, blob-type rejection,
+// HTTP error mapping, range parsing, chunk arithmetic, config env
+// fallback) lives as fast unit tests in the respective driver / server
+// / chunk / config packages. The scenarios here are reserved for
+// behavior that can only be verified end-to-end against real
+// LocalStack (or Azurite, in azure_test.go) plus a real cluster of
+// in-process orca instances.
+
+// TestColdAndWarmGet exercises GET twice for the same single-chunk
+// blob: cold (origin fetch + cache commit) and warm (cachestore hit).
+// The warm phase deletes the origin object first to prove the cache
+// hit really happened.
+func TestColdAndWarmGet(t *testing.T) {
+	t.Parallel()
+
+	ctx, cancel := context.WithTimeout(t.Context(), 60*time.Second)
+	defer cancel()
+
+	bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin")
+	blob := SmallBlob()
+	SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{blob})
+
+	cl := StartCluster(ctx, t, ClusterOptions{
+		LocalStack:   pkgLocalStack,
+		OriginBucket: bucket,
+	})
+
+	cold := cl.Get(1).HTTP.Get(ctx, t, bucket, blob.Key)
+	if cold.Status != http.StatusOK {
+		t.Fatalf("cold status=%d body=%s", cold.Status, string(cold.Body))
+	}
+
+	if !bytes.Equal(cold.Body, blob.Data) {
+		t.Fatalf("cold body mismatch: got %d bytes, want %d", len(cold.Body), len(blob.Data))
+	}
+
+	if cold.Header.Get("ETag") == "" {
+		t.Errorf("expected ETag header on cold GET")
+	}
+
+	DeleteS3Object(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, blob.Key)
+
+	warm := cl.Get(1).HTTP.Get(ctx, t, bucket, blob.Key)
+	if warm.Status != http.StatusOK {
+		t.Fatalf("warm status=%d body=%s", warm.Status, string(warm.Body))
+	}
+
+	if !bytes.Equal(warm.Body, blob.Data) {
+		t.Fatalf("warm body mismatch: got %d bytes, want %d", len(warm.Body), len(blob.Data))
+	}
+}
+
+// TestRangedGet verifies byte-range requests return 206 +
+// Content-Range + the requested slice. Covers within-chunk,
+// cross-chunk, and (against a 64-chunk blob) various boundary edge
+// cases. The chunk-arithmetic branches are unit-tested separately in
+// internal/orca/chunk; this verifies the end-to-end HTTP Range
+// round-trip with real chunk bodies.
+func TestRangedGet(t *testing.T) {
+	t.Parallel()
+
+	ctx, cancel := context.WithTimeout(t.Context(), 120*time.Second)
+	defer cancel()
+
+	bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin")
+	medium := MediumBlob() // 1.5 MiB == 2 chunks at 1 MiB
+	huge := HugeBlob()     // 64 MiB == 64 chunks at 1 MiB
+	SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{medium, huge})
+
+	cl := StartCluster(ctx, t, ClusterOptions{
+		LocalStack:   pkgLocalStack,
+		OriginBucket: bucket,
+	})
+
+	resp := cl.Get(1).HTTP.GetRange(ctx, t, bucket, medium.Key, 100, 199)
+	if resp.Status != http.StatusPartialContent {
+		t.Fatalf("status=%d (want 206)", resp.Status)
+	}
+
+	if cr := resp.Header.Get("Content-Range"); cr == "" {
+		t.Errorf("expected Content-Range header")
+	}
+
+	want := medium.Data[100:200]
+	if !bytes.Equal(resp.Body, want) {
+		t.Fatalf("range body mismatch: got %d bytes, want %d", len(resp.Body), len(want))
+	}
+
+	chunkSize := int64(1024 * 1024)
+	resp2 := cl.Get(1).HTTP.GetRange(ctx, t, bucket, medium.Key, chunkSize-50, chunkSize+49)
+
+	if resp2.Status != http.StatusPartialContent {
+		t.Fatalf("cross-chunk status=%d (want 206)", resp2.Status)
+	}
+
+	want2 := medium.Data[chunkSize-50 : chunkSize+50]
+	if !bytes.Equal(resp2.Body, want2) {
+		t.Fatalf("cross-chunk range mismatch: got %d bytes, want %d", len(resp2.Body), len(want2))
+	}
+
+	t.Run("huge blob boundary cases", func(t *testing.T) {
+		const chunk = int64(1024 * 1024)
+
+		cases := []struct {
+			name       string
+			start, end int64
+		}{
+			{"starts exactly at chunk boundary 32", 32 * chunk, 32*chunk + 100},
+			{"ends exactly at chunk boundary 47", 48*chunk - 100, 48*chunk - 1},
+			{"covers chunks 10-12 (3 contiguous full chunks)", 10 * chunk, 13*chunk - 1},
+			{"straddles 5 consecutive boundaries (chunks 20-25)", 20*chunk + 100, 25*chunk + 200},
+		}
+
+		for _, tc := range cases {
+			t.Run(tc.name, func(t *testing.T) {
+				rr := cl.Get(1).HTTP.GetRange(ctx, t, bucket, huge.Key, tc.start, tc.end)
+				if rr.Status != http.StatusPartialContent {
+					t.Fatalf("status=%d (want 206)", rr.Status)
+				}
+
+				expected := huge.Data[tc.start : tc.end+1]
+				if !bytes.Equal(rr.Body, expected) {
+					t.Fatalf("body mismatch: got %d bytes, want %d", len(rr.Body), len(expected))
+				}
+			})
+		}
+	})
+}
+
+// TestMultiChunkGet verifies a full GET of a 64-chunk blob assembles
+// correctly across chunk boundaries. With 3 replicas and 64 chunks,
+// rendezvous-hashed coordinator selection statistically guarantees
+// every replica is the coordinator for many chunks, so this test
+// exercises both fillLocal and FillFromPeer paths thoroughly in a
+// single run.
+func TestMultiChunkGet(t *testing.T) {
+	t.Parallel()
+
+	ctx, cancel := context.WithTimeout(t.Context(), 120*time.Second)
+	defer cancel()
+
+	bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin")
+	blob := HugeBlob()
+	SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{blob})
+
+	cl := StartCluster(ctx, t, ClusterOptions{
+		LocalStack:   pkgLocalStack,
+		OriginBucket: bucket,
+	})
+
+	resp := cl.Get(1).HTTP.Get(ctx, t, bucket, blob.Key)
+	if resp.Status != http.StatusOK {
+		t.Fatalf("status=%d body=%s", resp.Status, string(resp.Body))
+	}
+
+	if !bytes.Equal(resp.Body, blob.Data) {
+		t.Fatalf("body mismatch: got %d bytes, want %d", len(resp.Body), len(blob.Data))
+	}
+}
+
+// TestRendezvousCoordinatorRouting verifies that a GET against a
+// non-coordinator replica routes through /internal/fill to the
+// coordinator and still returns the body. The CountingOrigin
+// decorator confirms exactly one origin GetRange happened across the
+// cluster (the coordinator's).
+func TestRendezvousCoordinatorRouting(t *testing.T) {
+	t.Parallel()
+
+	ctx, cancel := context.WithTimeout(t.Context(), 90*time.Second)
+	defer cancel()
+
+	bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin")
+	blob := SmallBlob()
+	SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{blob})
+
+	count := newCountingOriginForLocalStack(ctx, t, bucket)
+
+	cl := StartCluster(ctx, t, ClusterOptions{
+		LocalStack:     pkgLocalStack,
+		OriginBucket:   bucket,
+		OriginOverride: count,
+	})
+
+	headResp := cl.Get(1).HTTP.Head(ctx, t, bucket, blob.Key)
+
+	etag := stripQuotes(headResp.Header.Get("ETag"))
+	if etag == "" {
+		t.Fatalf("HEAD returned empty ETag: %+v", headResp.Header)
+	}
+
+	k := chunk.Key{
+		OriginID:  "inttest-origin",
+		Bucket:    bucket,
+		ObjectKey: blob.Key,
+		ETag:      etag,
+		ChunkSize: int64(1024 * 1024),
+		Index:     0,
+	}
+	coord := cl.Get(1).App.Cluster.Coordinator(k)
+
+	var nonCoord *Replica
+
+	for _, r := range cl.Replicas {
+		if r.SelfIP != coord.IP || r.InternalPort != coord.Port {
+			nonCoord = r
+			break
+		}
+	}
+
+	if nonCoord == nil {
+		t.Fatalf("could not find a non-coordinator replica; coord=%+v peers=%+v",
+			coord, cl.Get(1).App.Cluster.Peers())
+	}
+
+	count.Reset()
+
+	resp := nonCoord.HTTP.Get(ctx, t, bucket, blob.Key)
+	if resp.Status != http.StatusOK {
+		t.Fatalf("status=%d body=%s", resp.Status, string(resp.Body))
+	}
+
+	if !bytes.Equal(resp.Body, blob.Data) {
+		t.Fatalf("body mismatch: got %d bytes, want %d", len(resp.Body), len(blob.Data))
+	}
+	// Exactly one HEAD (HeadObject metadata cache) plus one GetRange
+	// (single chunk fetch). Cluster-wide dedup must not produce more.
+	if got := count.GetRanges(); got != 1 {
+		t.Errorf("origin GetRange count=%d (want 1)", got)
+	}
+}
+
+// TestSingleflightCollapse fires N concurrent GETs (one per replica)
+// for the same key and asserts the origin saw exactly one GetRange
+// per chunk (cluster-wide singleflight collapse).
+func TestSingleflightCollapse(t *testing.T) {
+	t.Parallel()
+
+	ctx, cancel := context.WithTimeout(t.Context(), 120*time.Second)
+	defer cancel()
+
+	bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin")
+	blob := HugeBlob() // 64 chunks
+	SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{blob})
+
+	count := newCountingOriginForLocalStack(ctx, t, bucket)
+
+	cl := StartCluster(ctx, t, ClusterOptions{
+		LocalStack:     pkgLocalStack,
+		OriginBucket:   bucket,
+		OriginOverride: count,
+	})
+
+	count.Reset()
+
+	var wg sync.WaitGroup
+
+	wg.Add(cl.Len())
+
+	results := make([][]byte, cl.Len())
+	statuses := make([]int, cl.Len())
+
+	for i := 1; i <= cl.Len(); i++ {
+		go func(i int) {
+			defer wg.Done()
+
+			r := cl.Get(i).HTTP.Get(ctx, t, bucket, blob.Key)
+			results[i-1] = r.Body
+			statuses[i-1] = r.Status
+		}(i)
+	}
+
+	wg.Wait()
+
+	for i, s := range statuses {
+		if s != http.StatusOK {
+			t.Fatalf("replica %d status=%d", i+1, s)
+		}
+
+		if !bytes.Equal(results[i], blob.Data) {
+			t.Fatalf("replica %d body mismatch: got %d bytes want %d", i+1, len(results[i]), len(blob.Data))
+		}
+	}
+	// HugeBlob spans 64 chunks; cluster-wide singleflight should
+	// dedupe each chunk to exactly one origin GetRange. Allow up to
+	// 76 (~20% slack) to absorb timing-dependent races where a
+	// joiner arrives during in-flight commit.
+	if got := count.GetRanges(); got > 76 {
+		t.Errorf("origin GetRange count=%d (want <= 76 for 64-chunk blob)", got)
+	}
+
+	if got := count.GetRanges(); got < 64 {
+		t.Errorf("origin GetRange count=%d (want >= 64 for 64-chunk cold fill)", got)
+	}
+}
+
+// TestPeerNotCoordinatorFallback induces real membership disagreement
+// and asserts the coordinator's /internal/fill returns 409 and the
+// requesting replica's local-fill fallback succeeds.
+//
+// Setup:
+//
+//   - 3-replica cluster with shared CountingInternalHandlerWrap so we
+//     can read 409 counts per receiving replica.
+//   - HEAD the seeded blob to learn ETag; compute Coordinator(k) for
+//     chunk 0 from replica 1's view (call it C).
+//   - Craft a phantom peer P (an unreachable IP/Port pair) whose
+//     rendezvous score for k is higher than C's. Mutate C's peer
+//     source to include P plus C itself; now C.IsCoordinator(k)
+//     returns false because P wins.
+//   - Find another replica R whose view still says C is the
+//     coordinator. GET via R.
+//
+// Expected:
+//
+//   - R issues /internal/fill to C.
+//   - C responds 409 (its IsCoordinator returns false because P wins).
+//   - R falls through to fillLocal, fetches the origin, serves the
+//     body.
+//   - counter.Count(C, 409) >= 1.
+func TestPeerNotCoordinatorFallback(t *testing.T) {
+	t.Parallel()
+
+	ctx, cancel := context.WithTimeout(t.Context(), 90*time.Second)
+	defer cancel()
+
+	bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin")
+	blob := SmallBlob()
+	SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{blob})
+
+	wrap := NewCountingInternalHandlerWrap()
+
+	cl := StartCluster(ctx, t, ClusterOptions{
+		LocalStack:          pkgLocalStack,
+		OriginBucket:        bucket,
+		InternalHandlerWrap: wrap,
+	})
+
+	headResp := cl.Get(1).HTTP.Head(ctx, t, bucket, blob.Key)
+
+	etag := stripQuotes(headResp.Header.Get("ETag"))
+	if etag == "" {
+		t.Fatalf("HEAD returned empty ETag: %+v", headResp.Header)
+	}
+
+	k := chunk.Key{
+		OriginID:  "inttest-origin",
+		Bucket:    bucket,
+		ObjectKey: blob.Key,
+		ETag:      etag,
+		ChunkSize: int64(1024 * 1024),
+		Index:     0,
+	}
+	coord := cl.Get(1).App.Cluster.Coordinator(k)
+
+	coordReplica := cl.FindBySelfIPPort(coord.IP, coord.Port)
+	if coordReplica == nil {
+		t.Fatalf("coord %+v not found among replicas", coord)
+	}
+
+	// Craft a phantom peer whose rendezvous score beats coord's for k.
+	// The phantom's IP/Port don't need to be reachable; it's never
+	// dialed, only used to skew rendezvous on coord's view.
+	pathBytes := []byte(k.Path())
+	coordScore := cluster.Score(coord, pathBytes)
+	phantom := cluster.Peer{IP: "203.0.113.1"} // TEST-NET-3, unreachable
+
+	for port := 1; port < 65536; port++ {
+		phantom.Port = port
+		if cluster.Score(phantom, pathBytes) > coordScore {
+			break
+		}
+	}
+
+	if cluster.Score(phantom, pathBytes) <= coordScore {
+		t.Fatalf("could not find a phantom peer beating coord rendezvous score")
+	}
+
+	// Build coord's new peer-set: original real peers plus the
+	// phantom. The StaticPeerSource will stamp Self=true only on the
+	// peer matching coord's (selfIP, selfPort), so coord still
+	// recognizes itself; but the phantom wins rendezvous, so
+	// coord.IsCoordinator(k) flips to false.
+	newPeers := make([]cluster.Peer, 0, cl.Len()+1)
+	for _, r := range cl.Replicas {
+		newPeers = append(newPeers, cluster.Peer{IP: r.SelfIP, Port: r.InternalPort})
+	}
+
+	newPeers = append(newPeers, phantom)
+	coordReplica.PeerSource.SetPeers(newPeers)
+
+	if err := waitForCondition(ctx, 2*time.Second, func() bool {
+		return !coordReplica.App.Cluster.IsCoordinator(k)
+	}); err != nil {
+		t.Fatalf("coord did not relinquish coordinator status: %v", err)
+	}
+	// Find a replica R whose view still says coord is the coordinator.
+	var requester *Replica
+
+	for _, r := range cl.Replicas {
+		if r == coordReplica {
+			continue
+		}
+
+		rc := r.App.Cluster.Coordinator(k)
+		if rc.IP == coord.IP && rc.Port == coord.Port {
+			requester = r
+			break
+		}
+	}
+
+	if requester == nil {
+		t.Fatalf("no non-coord replica still views coord %+v as coordinator", coord)
+	}
+
+	resp := requester.HTTP.Get(ctx, t, bucket, blob.Key)
+	if resp.Status != http.StatusOK {
+		t.Fatalf("status=%d body=%s", resp.Status, string(resp.Body))
+	}
+
+	if !bytes.Equal(resp.Body, blob.Data) {
+		t.Fatalf("body mismatch: got %d bytes, want %d", len(resp.Body), len(blob.Data))
+	}
+
+	coordKey := coord.IP + ":" + strconv.Itoa(coord.Port)
+	if got := wrap.Count(coordKey, http.StatusConflict); got < 1 {
+		t.Fatalf("expected at least one 409 from coord %s; got %d",
+			coordKey, got)
+	}
+}
+
+func newCountingOriginForLocalStack(ctx context.Context, t *testing.T, bucket string) *CountingOrigin {
+	t.Helper()
+
+	or, err := localStackOrigin(ctx, t, bucket)
+	if err != nil {
+		t.Fatalf("localStackOrigin: %v", err)
+	}
+
+	return NewCountingOrigin(or)
+}
+
+func stripQuotes(s string) string {
+	if len(s) >= 2 && s[0] == '"' && s[len(s)-1] == '"' {
+		return s[1 : len(s)-1]
+	}
+
+	return s
+}
+
+func waitForCondition(ctx context.Context, dl time.Duration, cond func() bool) error {
+	deadline := time.Now().Add(dl)
+	for time.Now().Before(deadline) {
+		if cond() {
+			return nil
+		}
+
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-time.After(25 * time.Millisecond):
+		}
+	}
+
+	if cond() {
+		return nil
+	}
+
+	return context.DeadlineExceeded
+}
diff --git a/internal/orca/inttest/harness.go b/internal/orca/inttest/harness.go
new file mode 100644
index 00000000..48a99c92
--- /dev/null
+++ b/internal/orca/inttest/harness.go
@@ -0,0 +1,378 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//go:build integrationtest
+
+package inttest
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"log/slog"
+	"net"
+	"strconv"
+	"testing"
+	"time"
+
+	"github.com/Azure/unbounded/internal/orca/app"
+	"github.com/Azure/unbounded/internal/orca/cachestore"
+	"github.com/Azure/unbounded/internal/orca/cluster"
+	"github.com/Azure/unbounded/internal/orca/config"
+	"github.com/Azure/unbounded/internal/orca/origin"
+)
+
+// ClusterOptions controls Harness.StartCluster.
+type ClusterOptions struct {
+	// Replicas is the number of in-process orca instances. Defaults
+	// to 3 when zero, matching the production deploy/orca topology.
+	Replicas int
+
+	// ChunkSize is the per-chunk byte count. The orca config validator
+	// enforces a 1 MiB minimum; tests typically use 1 MiB to keep test
+	// blob sizes manageable while still spanning multiple chunks.
+	ChunkSize int64
+
+	// OriginID is the logical origin identifier (echoed in chunk paths).
+	OriginID string
+
+	// OriginBucket is the bucket on the origin LocalStack/Azurite.
+	OriginBucket string
+
+	// OriginDriver is "awss3" (default) or "azureblob".
+	OriginDriver string
+
+	// LocalStack is the LocalStack handle used for origin (when
+	// OriginDriver=="awss3") and always for cachestore.
+	LocalStack *LocalStack
+
+	// Azurite is required when OriginDriver=="azureblob".
+	Azurite *Azurite
+
+	// AzureContainer is the Azurite container name for the origin.
+	AzureContainer string
+
+	// CachestoreBucket is the bucket on LocalStack used as the orca
+	// cachestore. If empty, a fresh bucket is allocated.
+	CachestoreBucket string
+
+	// OriginOverride, when set, replaces the constructed origin driver.
+	// Used to wire CountingOrigin around the real client.
+	OriginOverride origin.Origin
+
+	// CacheStoreOverride, when set, replaces the constructed cachestore
+	// driver.
+	CacheStoreOverride cachestore.CacheStore
+
+	// InternalHandlerWrap, when set, is registered with each replica's
+	// app.WithInternalHandlerWrap. Tests use this to install a 409
+	// counter (CountingInternalHandlerWrap.WrapFor).
+	InternalHandlerWrap *CountingInternalHandlerWrap
+}
+
+// Replica represents one running *app.App in the harness.
+type Replica struct {
+	App          *app.App
+	SelfIP       string
+	InternalPort int
+	PeerSource   *StaticPeerSource
+	HTTP         *Client // pre-built client targeting this replica's edge
+}
+
+// Cluster is a collection of Replicas plus the harness-owned context.
+type Cluster struct {
+	Replicas []*Replica
+}
+
+// Get returns replica i (1-indexed).
+func (c *Cluster) Get(i int) *Replica { return c.Replicas[i-1] }
+
+// Len returns the replica count.
+func (c *Cluster) Len() int { return len(c.Replicas) }
+
+// FindBySelfIPPort returns the replica whose (SelfIP, InternalPort)
+// matches the given peer; nil if none.
+func (c *Cluster) FindBySelfIPPort(ip string, port int) *Replica {
+	for _, r := range c.Replicas {
+		if r.SelfIP == ip && r.InternalPort == port {
+			return r
+		}
+	}
+
+	return nil
+}
+
+// StartCluster brings up `opts.Replicas` orca instances (default 3)
+// pointed at the origin/cachestore described in opts. Every replica
+// binds to 127.0.0.1 with an OS-assigned distinct internal port; one
+// StaticPeerSource per replica is initialized with the full peer set
+// (with explicit ports). Tests can mutate any replica's PeerSource
+// independently.
+//
+// Cleanup (Shutdown of each app) is registered with t.Cleanup.
+func StartCluster(ctx context.Context, t *testing.T, opts ClusterOptions) *Cluster {
+	t.Helper()
+
+	if opts.Replicas == 0 {
+		opts.Replicas = 3
+	}
+
+	if opts.Replicas < 1 {
+		t.Fatalf("StartCluster: Replicas must be >= 1, got %d", opts.Replicas)
+	}
+
+	if opts.ChunkSize == 0 {
+		opts.ChunkSize = 1024 * 1024
+	}
+
+	if opts.OriginDriver == "" {
+		opts.OriginDriver = "awss3"
+	}
+
+	if opts.OriginID == "" {
+		opts.OriginID = "inttest-origin"
+	}
+
+	if opts.LocalStack == nil {
+		t.Fatal("StartCluster: LocalStack handle required")
+	}
+
+	if opts.OriginDriver == "azureblob" {
+		if opts.Azurite == nil {
+			t.Fatal("StartCluster: Azurite handle required for azureblob driver")
+		}
+
+		if opts.AzureContainer == "" {
+			t.Fatal("StartCluster: AzureContainer required for azureblob driver")
+		}
+	}
+
+	if opts.OriginBucket == "" && opts.OriginDriver == "awss3" {
+		t.Fatal("StartCluster: OriginBucket required for awss3 driver")
+	}
+
+	cacheBucket := opts.CachestoreBucket
+	if cacheBucket == "" {
+		cacheBucket = opts.LocalStack.NewBucket(ctx, t, "orca-cache")
+	}
+
+	// Allocate per-replica internal listeners up front (open) so each
+	// replica's peer source can advertise the full set with explicit
+	// ports from t=0. We hand the open listeners to app.Start via
+	// WithInternalListener/WithEdgeListener/WithOpsListener so there
+	// is no close-and-rebind window for races with concurrent tests.
+	internalListeners := make([]net.Listener, opts.Replicas)
+	internalPorts := make([]int, opts.Replicas)
+	edgeListeners := make([]net.Listener, opts.Replicas)
+	opsListeners := make([]net.Listener, opts.Replicas)
+
+	for i := range internalListeners {
+		ln, err := net.Listen("tcp", "127.0.0.1:0")
+		if err != nil {
+			closeListeners(internalListeners)
+			closeListeners(edgeListeners)
+			closeListeners(opsListeners)
+			t.Fatalf("alloc internal port for replica %d: %v", i+1, err)
+		}
+
+		internalListeners[i] = ln
+		internalPorts[i] = ln.Addr().(*net.TCPAddr).Port //nolint:errcheck // *net.TCPAddr from net.Listen
+
+		eln, err := net.Listen("tcp", "127.0.0.1:0")
+		if err != nil {
+			closeListeners(internalListeners)
+			closeListeners(edgeListeners)
+			closeListeners(opsListeners)
+			t.Fatalf("alloc edge port for replica %d: %v", i+1, err)
+		}
+
+		edgeListeners[i] = eln
+
+		oln, err := net.Listen("tcp", "127.0.0.1:0")
+		if err != nil {
+			closeListeners(internalListeners)
+			closeListeners(edgeListeners)
+			closeListeners(opsListeners)
+			t.Fatalf("alloc ops port for replica %d: %v", i+1, err)
+		}
+
+		opsListeners[i] = oln
+	}
+
+	allPeers := make([]cluster.Peer, opts.Replicas)
+	for i := range allPeers {
+		allPeers[i] = cluster.Peer{
+			IP:   "127.0.0.1",
+			Port: internalPorts[i],
+		}
+	}
+
+	cl := &Cluster{}
+
+	logger := slog.New(slog.NewTextHandler(io.Discard, nil))
+
+	for i := 0; i < opts.Replicas; i++ {
+		selfIP := "127.0.0.1"
+		selfPort := internalPorts[i]
+		ps := NewStaticPeerSource(selfIP, selfPort, allPeers)
+
+		cfg := buildConfig(opts, cacheBucket)
+		cfg.Cluster.SelfPodIP = selfIP
+		cfg.Cluster.InternalListen = net.JoinHostPort(selfIP, strconv.Itoa(selfPort))
+		cfg.Server.Listen = edgeListeners[i].Addr().String()
+
+		appOpts := []app.Option{
+			app.WithLogger(logger),
+			app.WithPeerSource(ps),
+			app.WithEdgeListener(edgeListeners[i]),
+			app.WithInternalListener(internalListeners[i]),
+			app.WithOpsListener(opsListeners[i]),
+		}
+
+		if opts.OriginOverride != nil {
+			appOpts = append(appOpts, app.WithOrigin(opts.OriginOverride))
+		}
+
+		if opts.CacheStoreOverride != nil {
+			appOpts = append(appOpts, app.WithCacheStore(opts.CacheStoreOverride))
+		}
+
+		if opts.InternalHandlerWrap != nil {
+			appOpts = append(appOpts, app.WithInternalHandlerWrap(opts.InternalHandlerWrap.WrapFor(selfIP+":"+strconv.Itoa(selfPort))))
+		}
+
+		a, err := app.Start(ctx, cfg, appOpts...)
+		if err != nil {
+			t.Fatalf("app.Start replica %d: %v", i+1, err)
+		}
+
+		r := &Replica{
+			App:          a,
+			SelfIP:       selfIP,
+			InternalPort: selfPort,
+			PeerSource:   ps,
+			HTTP:         NewClient("http://" + a.EdgeAddr),
+		}
+		cl.Replicas = append(cl.Replicas, r)
+
+		t.Cleanup(func() {
+			ctxShut, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+			defer cancel()
+
+			_ = a.Shutdown(ctxShut) //nolint:errcheck // shutdown logs already emitted
+		})
+	}
+	// Wait for every replica's Cluster.Peers() to converge to the
+	// full set.
+	if err := waitForPeers(ctx, cl, opts.Replicas, 2*time.Second); err != nil {
+		t.Fatalf("waitForPeers: %v", err)
+	}
+
+	return cl
+}
+
+func buildConfig(opts ClusterOptions, cacheBucket string) *config.Config {
+	cfg := &config.Config{
+		Server: config.Server{
+			Listen: "127.0.0.1:0",
+			Auth:   config.ServerAuth{Enabled: false},
+		},
+		Origin: config.Origin{
+			ID:           opts.OriginID,
+			Driver:       opts.OriginDriver,
+			TargetGlobal: 32,
+			QueueTimeout: 5 * time.Second,
+			Retry: config.OriginRetry{
+				Attempts:         2,
+				BackoffInitial:   10 * time.Millisecond,
+				BackoffMax:       50 * time.Millisecond,
+				MaxTotalDuration: 2 * time.Second,
+			},
+		},
+		Cachestore: config.Cachestore{
+			Driver: "s3",
+			S3: config.CachestoreS3{
+				Endpoint:     opts.LocalStack.Endpoint(),
+				Bucket:       cacheBucket,
+				Region:       opts.LocalStack.Region(),
+				AccessKey:    opts.LocalStack.AccessKey(),
+				SecretKey:    opts.LocalStack.SecretKey(),
+				UsePathStyle: true,
+			},
+		},
+		Cluster: config.Cluster{
+			Service:           "orca-peers.test.svc.cluster.local",
+			MembershipRefresh: 250 * time.Millisecond,
+			InternalListen:    "127.0.0.1:0", // overridden per replica
+			InternalTLS:       config.InternalTLS{Enabled: false},
+			TargetReplicas:    opts.Replicas,
+			SelfPodIP:         "127.0.0.1", // overridden per replica
+		},
+		ChunkCatalog: config.ChunkCatalog{MaxEntries: 1024},
+		Metadata: config.Metadata{
+			TTL:         5 * time.Minute,
+			NegativeTTL: 5 * time.Second,
+			MaxEntries:  1024,
+		},
+		Chunking: config.Chunking{Size: opts.ChunkSize},
+	}
+
+	switch opts.OriginDriver {
+	case "awss3":
+		cfg.Origin.AWSS3 = config.AWSS3{
+			Endpoint:     opts.LocalStack.Endpoint(),
+			Region:       opts.LocalStack.Region(),
+			Bucket:       opts.OriginBucket,
+			AccessKey:    opts.LocalStack.AccessKey(),
+			SecretKey:    opts.LocalStack.SecretKey(),
+			UsePathStyle: true,
+		}
+	case "azureblob":
+		cfg.Origin.Azureblob = config.Azureblob{
+			Account:    opts.Azurite.AccountName(),
+			AccountKey: opts.Azurite.AccountKey(),
+			Container:  opts.AzureContainer,
+			Endpoint:   opts.Azurite.Endpoint(),
+		}
+	}
+
+	return cfg
+}
+
+// waitForPeers polls each replica's cluster.Peers() until every
+// replica has at least the expected count or the deadline elapses.
+func waitForPeers(ctx context.Context, cl *Cluster, want int, dl time.Duration) error {
+	deadline := time.Now().Add(dl)
+
+	for time.Now().Before(deadline) {
+		ok := true
+
+		for _, r := range cl.Replicas {
+			if len(r.App.Cluster.Peers()) < want {
+				ok = false
+				break
+			}
+		}
+
+		if ok {
+			return nil
+		}
+
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-time.After(50 * time.Millisecond):
+		}
+	}
+
+	return fmt.Errorf("peer-set did not converge to %d on all %d replicas within %s",
+		want, len(cl.Replicas), dl)
+}
+
+func closeListeners(lns []net.Listener) {
+	for _, ln := range lns {
+		if ln != nil {
+			_ = ln.Close() //nolint:errcheck // best-effort cleanup
+		}
+	}
+}
diff --git a/internal/orca/inttest/images.go b/internal/orca/inttest/images.go
new file mode 100644
index 00000000..d90aaba9
--- /dev/null
+++ b/internal/orca/inttest/images.go
@@ -0,0 +1,30 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//go:build integrationtest
+
+package inttest
+
+// Pinned container image tags. Bump centrally when upgrading.
+const (
+	// localstackImage is the LocalStack image used for both the origin
+	// (awss3) and cachestore (s3) backends. Pinned to 3.8 because
+	// later LocalStack tags require the AWS SDK CRC64NVME checksum
+	// opt-out (which the cachestore/s3 driver and this harness's S3
+	// client builder both apply).
+	localstackImage = "localstack/localstack:3.8"
+
+	// azuriteImage is the Azurite (Azure Blob emulator) image. We pin
+	// to a specific minor for reproducibility.
+	azuriteImage = "mcr.microsoft.com/azure-storage/azurite:3.34.0"
+
+	// azuritePort is the blob-service port published by Azurite.
+	azuritePort = "10000"
+
+	// azuriteAccountName is the well-known Azurite dev account.
+	azuriteAccountName = "devstoreaccount1"
+
+	// azuriteAccountKey is the well-known Azurite dev account key. It
+	// is hard-coded by the emulator; not a secret.
+	azuriteAccountKey = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw=="
+)
diff --git a/internal/orca/inttest/internalwrap.go b/internal/orca/inttest/internalwrap.go
new file mode 100644
index 00000000..78d29233
--- /dev/null
+++ b/internal/orca/inttest/internalwrap.go
@@ -0,0 +1,145 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//go:build integrationtest
+
+package inttest
+
+import (
+	"net/http"
+	"sync"
+	"sync/atomic"
+)
+
+// CountingInternalHandlerWrap is an http.Handler decorator factory
+// that counts response status codes per receiving replica IP. Used
+// by TestPeerNotCoordinatorFallback to assert a peer's
+// /internal/fill handler returned 409 (proving the cluster.go 409
+// fallback path actually fired on the requesting replica).
+//
+// One CountingInternalHandlerWrap is shared across all replicas in
+// the harness; each replica's wrapped handler stamps its self IP
+// onto the response writer so counts can be attributed back.
+type CountingInternalHandlerWrap struct {
+	mu      sync.Mutex
+	counts  map[string]map[int]*atomic.Int64 // selfIP -> status -> count
+	defined map[string]struct{}
+}
+
+// NewCountingInternalHandlerWrap returns an empty wrapper.
+func NewCountingInternalHandlerWrap() *CountingInternalHandlerWrap {
+	return &CountingInternalHandlerWrap{
+		counts:  make(map[string]map[int]*atomic.Int64),
+		defined: make(map[string]struct{}),
+	}
+}
+
+// WrapFor returns a wrap function suitable for app.WithInternalHandlerWrap
+// that attributes status-code counts back to the named selfIP.
+func (w *CountingInternalHandlerWrap) WrapFor(selfIP string) func(http.Handler) http.Handler {
+	w.mu.Lock()
+	if _, ok := w.counts[selfIP]; !ok {
+		w.counts[selfIP] = make(map[int]*atomic.Int64)
+	}
+
+	w.defined[selfIP] = struct{}{}
+	w.mu.Unlock()
+
+	return func(next http.Handler) http.Handler {
+		return http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
+			cw := &countingResponseWriter{ResponseWriter: rw, status: http.StatusOK}
+			next.ServeHTTP(cw, req)
+			w.record(selfIP, cw.status)
+		})
+	}
+}
+
+// Count returns the number of responses with the given status code
+// observed at the named selfIP.
+func (w *CountingInternalHandlerWrap) Count(selfIP string, status int) int64 {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+
+	byStatus, ok := w.counts[selfIP]
+	if !ok {
+		return 0
+	}
+
+	c, ok := byStatus[status]
+	if !ok {
+		return 0
+	}
+
+	return c.Load()
+}
+
+// CountAcross returns the count summed across all known selfIPs.
+func (w *CountingInternalHandlerWrap) CountAcross(status int) int64 {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+
+	var total int64
+
+	for _, byStatus := range w.counts {
+		if c, ok := byStatus[status]; ok {
+			total += c.Load()
+		}
+	}
+
+	return total
+}
+
+func (w *CountingInternalHandlerWrap) record(selfIP string, status int) {
+	w.mu.Lock()
+
+	byStatus, ok := w.counts[selfIP]
+	if !ok {
+		byStatus = make(map[int]*atomic.Int64)
+		w.counts[selfIP] = byStatus
+	}
+
+	c, ok := byStatus[status]
+	if !ok {
+		c = &atomic.Int64{}
+		byStatus[status] = c
+	}
+
+	w.mu.Unlock()
+	c.Add(1)
+}
+
+// countingResponseWriter records the first WriteHeader status; if no
+// WriteHeader is ever called, http.StatusOK is recorded (matching the
+// net/http default).
+type countingResponseWriter struct {
+	http.ResponseWriter
+	status      int
+	wroteHeader bool
+}
+
+func (c *countingResponseWriter) WriteHeader(status int) {
+	if !c.wroteHeader {
+		c.status = status
+		c.wroteHeader = true
+	}
+
+	c.ResponseWriter.WriteHeader(status)
+}
+
+func (c *countingResponseWriter) Write(p []byte) (int, error) {
+	if !c.wroteHeader {
+		c.wroteHeader = true
+	}
+
+	return c.ResponseWriter.Write(p)
+}
+
+// Flush passes through to the embedded ResponseWriter when it
+// implements http.Flusher. Without this method, wrapping a handler
+// that streams via Flush() (e.g. the edge handler's per-chunk
+// f.Flush()) would silently degrade to buffered responses.
+func (c *countingResponseWriter) Flush() {
+	if fl, ok := c.ResponseWriter.(http.Flusher); ok {
+		fl.Flush()
+	}
+}
diff --git a/internal/orca/inttest/localstack.go b/internal/orca/inttest/localstack.go
new file mode 100644
index 00000000..5abb404d
--- /dev/null
+++ b/internal/orca/inttest/localstack.go
@@ -0,0 +1,180 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//go:build integrationtest
+
+package inttest
+
+import (
+	"context"
+	"fmt"
+	"testing"
+
+	"github.com/aws/aws-sdk-go-v2/aws"
+	awsconfig "github.com/aws/aws-sdk-go-v2/config"
+	"github.com/aws/aws-sdk-go-v2/credentials"
+	"github.com/aws/aws-sdk-go-v2/service/s3"
+	s3types "github.com/aws/aws-sdk-go-v2/service/s3/types"
+	"github.com/testcontainers/testcontainers-go"
+	"github.com/testcontainers/testcontainers-go/wait"
+)
+
+// LocalStack is a running LocalStack container with helper accessors
+// for constructing AWS S3 clients pointed at it. Use NewS3Client to
+// get a configured client; use NewBucket to allocate a fresh bucket
+// for a single test.
+type LocalStack struct {
+	container testcontainers.Container
+	endpoint  string
+	region    string
+}
+
+// AccessKey returns the LocalStack-default access key. LocalStack does
+// not validate credentials but the AWS SDK requires non-empty values.
+func (ls *LocalStack) AccessKey() string { return "test" }
+
+// SecretKey returns the LocalStack-default secret key.
+func (ls *LocalStack) SecretKey() string { return "test" }
+
+// Endpoint returns the http:// URL of the LocalStack edge port.
+func (ls *LocalStack) Endpoint() string { return ls.endpoint }
+
+// Region returns the static region the harness uses with LocalStack.
+func (ls *LocalStack) Region() string { return ls.region }
+
+// StartLocalStack launches a LocalStack container and returns a handle
+// once the edge port is healthy. Caller is responsible for terminating
+// the container (via container.Terminate or t.Cleanup).
+func StartLocalStack(ctx context.Context) (*LocalStack, error) {
+	req := testcontainers.ContainerRequest{
+		Image:        localstackImage,
+		ExposedPorts: []string{"4566/tcp"},
+		Env: map[string]string{
+			"SERVICES": "s3",
+			// LocalStack 3.8 returns InvalidRequest on the SDK's
+			// CRC64NVME default checksum. The orca s3 driver opts out
+			// at the SDK config level, but seeding clients in tests
+			// must do the same. We set the variables both in the
+			// container env (for any in-container tooling) and on the
+			// SDK config in NewS3Client.
+			"S3_SKIP_SIGNATURE_VALIDATION": "1",
+		},
+		WaitingFor: wait.ForHTTP("/_localstack/health").
+			WithPort("4566/tcp").
+			WithStatusCodeMatcher(func(status int) bool { return status == 200 }),
+	}
+
+	c, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{
+		ContainerRequest: req,
+		Started:          true,
+	})
+	if err != nil {
+		return nil, fmt.Errorf("start localstack: %w", err)
+	}
+
+	host, err := c.Host(ctx)
+	if err != nil {
+		_ = c.Terminate(ctx) //nolint:errcheck // best-effort cleanup
+		return nil, fmt.Errorf("localstack host: %w", err)
+	}
+
+	port, err := c.MappedPort(ctx, "4566/tcp")
+	if err != nil {
+		_ = c.Terminate(ctx) //nolint:errcheck // best-effort cleanup
+		return nil, fmt.Errorf("localstack port: %w", err)
+	}
+
+	return &LocalStack{
+		container: c,
+		endpoint:  fmt.Sprintf("http://%s:%s", host, port.Port()),
+		region:    "us-east-1",
+	}, nil
+}
+
+// Terminate stops and removes the LocalStack container.
+func (ls *LocalStack) Terminate(ctx context.Context) error {
+	return ls.container.Terminate(ctx)
+}
+
+// NewS3Client returns an AWS S3 client with LocalStack-friendly
+// settings (path-style addressing, dummy credentials, checksum quirks
+// disabled).
+func (ls *LocalStack) NewS3Client(ctx context.Context, t *testing.T) *s3.Client {
+	t.Helper()
+
+	cfg, err := awsconfig.LoadDefaultConfig(ctx,
+		awsconfig.WithRegion(ls.region),
+		awsconfig.WithCredentialsProvider(credentials.NewStaticCredentialsProvider(
+			ls.AccessKey(), ls.SecretKey(), "",
+		)),
+		awsconfig.WithRequestChecksumCalculation(aws.RequestChecksumCalculationWhenRequired),
+		awsconfig.WithResponseChecksumValidation(aws.ResponseChecksumValidationWhenRequired),
+	)
+	if err != nil {
+		t.Fatalf("aws config: %v", err)
+	}
+
+	return s3.NewFromConfig(cfg, func(o *s3.Options) {
+		o.BaseEndpoint = aws.String(ls.endpoint)
+		o.UsePathStyle = true
+	})
+}
+
+// NewBucket creates a fresh bucket and registers a t.Cleanup hook to
+// best-effort delete it. Returns the bucket name.
+func (ls *LocalStack) NewBucket(ctx context.Context, t *testing.T, prefix string) string {
+	t.Helper()
+
+	cli := ls.NewS3Client(ctx, t)
+	name := uniqueName(prefix)
+
+	if _, err := cli.CreateBucket(ctx, &s3.CreateBucketInput{
+		Bucket: aws.String(name),
+	}); err != nil {
+		t.Fatalf("create bucket %s: %v", name, err)
+	}
+
+	t.Cleanup(func() {
+		emptyBucket(context.Background(), cli, name)
+
+		_, _ = cli.DeleteBucket(context.Background(), &s3.DeleteBucketInput{ //nolint:errcheck // best-effort cleanup
+			Bucket: aws.String(name),
+		})
+	})
+
+	return name
+}
+
+// EnableVersioning toggles versioning on a bucket. Used by the
+// versioning-gate negative test.
+func (ls *LocalStack) EnableVersioning(ctx context.Context, t *testing.T, bucket string) {
+	t.Helper()
+
+	cli := ls.NewS3Client(ctx, t)
+	if _, err := cli.PutBucketVersioning(ctx, &s3.PutBucketVersioningInput{
+		Bucket: aws.String(bucket),
+		VersioningConfiguration: &s3types.VersioningConfiguration{
+			Status: s3types.BucketVersioningStatusEnabled,
+		},
+	}); err != nil {
+		t.Fatalf("enable versioning on %s: %v", bucket, err)
+	}
+}
+
+// emptyBucket deletes every object in the bucket. Best-effort; errors
+// are ignored.
+func emptyBucket(ctx context.Context, cli *s3.Client, bucket string) {
+	out, err := cli.ListObjectsV2(ctx, &s3.ListObjectsV2Input{
+		Bucket: aws.String(bucket),
+	})
+	if err != nil {
+		return
+	}
+
+	for _, obj := range out.Contents {
+		_, _ = cli.DeleteObject(ctx, &s3.DeleteObjectInput{ //nolint:errcheck // best-effort cleanup
+			Bucket: aws.String(bucket),
+			Key:    obj.Key,
+		})
+	}
+}
diff --git a/internal/orca/inttest/main_test.go b/internal/orca/inttest/main_test.go
new file mode 100644
index 00000000..f793abd6
--- /dev/null
+++ b/internal/orca/inttest/main_test.go
@@ -0,0 +1,58 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//go:build integrationtest
+
+package inttest
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"testing"
+	"time"
+)
+
+// Package-level container handles shared across tests in this package.
+// TestMain brings them up once and tears them down at the end.
+var (
+	pkgLocalStack *LocalStack
+	pkgAzurite    *Azurite
+)
+
+// TestMain provisions LocalStack + Azurite once per `go test` run.
+// Per-test buckets / containers are allocated inside individual tests
+// to avoid cross-test interference.
+func TestMain(m *testing.M) {
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+	defer cancel()
+
+	ls, err := StartLocalStack(ctx)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "TestMain: start localstack: %v\n", err)
+		os.Exit(1)
+	}
+
+	pkgLocalStack = ls
+
+	az, err := StartAzurite(ctx)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "TestMain: start azurite: %v\n", err)
+
+		_ = ls.Terminate(ctx) //nolint:errcheck // best-effort cleanup
+
+		os.Exit(1)
+	}
+
+	pkgAzurite = az
+
+	code := m.Run()
+
+	termCtx, termCancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer termCancel()
+
+	_ = pkgAzurite.Terminate(termCtx)    //nolint:errcheck // best-effort
+	_ = pkgLocalStack.Terminate(termCtx) //nolint:errcheck // best-effort
+
+	os.Exit(code)
+}
diff --git a/internal/orca/inttest/origins_test.go b/internal/orca/inttest/origins_test.go
new file mode 100644
index 00000000..df4012f6
--- /dev/null
+++ b/internal/orca/inttest/origins_test.go
@@ -0,0 +1,30 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//go:build integrationtest
+
+package inttest
+
+import (
+	"context"
+	"testing"
+
+	"github.com/Azure/unbounded/internal/orca/origin"
+	"github.com/Azure/unbounded/internal/orca/origin/awss3"
+)
+
+// localStackOrigin builds an awss3.Origin pointed at the package-level
+// LocalStack with the given bucket. Used by tests that need to wrap
+// the origin in a CountingOrigin decorator.
+func localStackOrigin(ctx context.Context, t *testing.T, bucket string) (origin.Origin, error) {
+	t.Helper()
+
+	return awss3.New(ctx, awss3.Config{
+		Endpoint:     pkgLocalStack.Endpoint(),
+		Region:       pkgLocalStack.Region(),
+		Bucket:       bucket,
+		AccessKey:    pkgLocalStack.AccessKey(),
+		SecretKey:    pkgLocalStack.SecretKey(),
+		UsePathStyle: true,
+	}, nil)
+}
diff --git a/internal/orca/inttest/originwrap.go b/internal/orca/inttest/originwrap.go
new file mode 100644
index 00000000..c215d9e8
--- /dev/null
+++ b/internal/orca/inttest/originwrap.go
@@ -0,0 +1,67 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//go:build integrationtest
+
+package inttest
+
+import (
+	"context"
+	"io"
+	"sync/atomic"
+
+	"github.com/Azure/unbounded/internal/orca/origin"
+)
+
+// CountingOrigin is an origin.Origin decorator that counts Head and
+// GetRange calls. It is used by tests that need to assert
+// singleflight collapse and coordinator routing.
+type CountingOrigin struct {
+	inner origin.Origin
+
+	heads     atomic.Int64
+	getRanges atomic.Int64
+	lists     atomic.Int64
+}
+
+// NewCountingOrigin wraps inner with call counters.
+func NewCountingOrigin(inner origin.Origin) *CountingOrigin {
+	return &CountingOrigin{inner: inner}
+}
+
+// Heads returns the number of Head() calls observed.
+func (c *CountingOrigin) Heads() int64 { return c.heads.Load() }
+
+// GetRanges returns the number of GetRange() calls observed.
+func (c *CountingOrigin) GetRanges() int64 { return c.getRanges.Load() }
+
+// Lists returns the number of List() calls observed.
+func (c *CountingOrigin) Lists() int64 { return c.lists.Load() }
+
+// Reset zeroes all counters.
+func (c *CountingOrigin) Reset() {
+	c.heads.Store(0)
+	c.getRanges.Store(0)
+	c.lists.Store(0)
+}
+
+// Head implements origin.Origin.
+func (c *CountingOrigin) Head(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) {
+	c.heads.Add(1)
+
+	return c.inner.Head(ctx, bucket, key)
+}
+
+// GetRange implements origin.Origin.
+func (c *CountingOrigin) GetRange(ctx context.Context, bucket, key, etag string, off, length int64) (io.ReadCloser, error) {
+	c.getRanges.Add(1)
+
+	return c.inner.GetRange(ctx, bucket, key, etag, off, length)
+}
+
+// List implements origin.Origin.
+func (c *CountingOrigin) List(ctx context.Context, bucket, prefix, marker string, maxKeys int) (origin.ListResult, error) {
+	c.lists.Add(1)
+
+	return c.inner.List(ctx, bucket, prefix, marker, maxKeys)
+}
diff --git a/internal/orca/inttest/peersource.go b/internal/orca/inttest/peersource.go
new file mode 100644
index 00000000..c349f601
--- /dev/null
+++ b/internal/orca/inttest/peersource.go
@@ -0,0 +1,67 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//go:build integrationtest
+
+package inttest
+
+import (
+	"context"
+	"sync"
+
+	"github.com/Azure/unbounded/internal/orca/cluster"
+)
+
+// StaticPeerSource implements cluster.PeerSource with a mutable peer
+// list. Each replica in the harness owns its own StaticPeerSource so
+// tests can mutate one replica's view of the cluster independently
+// (used by TestPeerNotCoordinatorFallback to induce membership
+// disagreement).
+//
+// The source knows its calling replica's identity (selfIP, selfPort)
+// so it can stamp Peer.Self correctly even when multiple peers share
+// an IP (the case in tests where every replica is on 127.0.0.1).
+type StaticPeerSource struct {
+	mu       sync.Mutex
+	selfIP   string
+	selfPort int
+	peers    []cluster.Peer
+}
+
+// NewStaticPeerSource returns a peer source that stamps Self=true on
+// any peer whose (IP, Port) matches the constructor arguments.
+func NewStaticPeerSource(selfIP string, selfPort int, peers []cluster.Peer) *StaticPeerSource {
+	s := &StaticPeerSource{
+		selfIP:   selfIP,
+		selfPort: selfPort,
+	}
+	s.SetPeers(peers)
+
+	return s
+}
+
+// SetPeers replaces the current peer list. Each peer's Self bit is
+// recomputed against the source's stored (selfIP, selfPort).
+func (s *StaticPeerSource) SetPeers(peers []cluster.Peer) {
+	out := make([]cluster.Peer, len(peers))
+	for i, p := range peers {
+		p.Self = p.IP == s.selfIP && p.Port == s.selfPort
+		out[i] = p
+	}
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	s.peers = out
+}
+
+// Peers satisfies cluster.PeerSource.
+func (s *StaticPeerSource) Peers(_ context.Context) ([]cluster.Peer, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	out := make([]cluster.Peer, len(s.peers))
+	copy(out, s.peers)
+
+	return out, nil
+}
diff --git a/internal/orca/inttest/seed.go b/internal/orca/inttest/seed.go
new file mode 100644
index 00000000..c286bcdc
--- /dev/null
+++ b/internal/orca/inttest/seed.go
@@ -0,0 +1,96 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//go:build integrationtest
+
+package inttest
+
+import (
+	"bytes"
+	"context"
+	"testing"
+
+	"github.com/aws/aws-sdk-go-v2/aws"
+	"github.com/aws/aws-sdk-go-v2/service/s3"
+)
+
+// SeedBlob describes a single blob seeded into the origin.
+type SeedBlob struct {
+	Key  string
+	Data []byte
+}
+
+// SmallBlob is one chunk's-worth (1 KiB).
+func SmallBlob() SeedBlob {
+	return SeedBlob{Key: "sample-1k", Data: deterministicBytes(1024, 0xa1)}
+}
+
+// MediumBlob spans two 1 MiB chunks.
+func MediumBlob() SeedBlob {
+	return SeedBlob{Key: "sample-2chunk", Data: deterministicBytes(1024*1024+512*1024, 0xb2)}
+}
+
+// HugeBlob spans 64 chunks at the harness's 1 MiB chunk size. With 3
+// replicas, rendezvous-hashed coordinator selection statistically
+// covers every replica many times over (~21 chunks per replica),
+// so any test using HugeBlob exercises the full local-fill +
+// cross-replica /internal/fill matrix in a single run.
+func HugeBlob() SeedBlob {
+	return SeedBlob{Key: "sample-64chunk", Data: deterministicBytes(64*1024*1024, 0xd4)}
+}
+
+// AllBlobs returns the canonical seed set used across most tests.
+func AllBlobs() []SeedBlob {
+	return []SeedBlob{SmallBlob(), MediumBlob(), HugeBlob()}
+}
+
+// SeedS3 uploads each blob to the named bucket via the provided
+// LocalStack-friendly S3 client.
+func SeedS3(ctx context.Context, t *testing.T, cli *s3.Client, bucket string, blobs []SeedBlob) {
+	t.Helper()
+
+	for _, b := range blobs {
+		if _, err := cli.PutObject(ctx, &s3.PutObjectInput{
+			Bucket: aws.String(bucket),
+			Key:    aws.String(b.Key),
+			Body:   bytes.NewReader(b.Data),
+		}); err != nil {
+			t.Fatalf("seed %s/%s: %v", bucket, b.Key, err)
+		}
+	}
+}
+
+// DeleteS3Object removes a blob from a LocalStack bucket. Used by
+// warm-cache tests to prove that subsequent GETs are served from the
+// cachestore and not refetched from the origin.
+func DeleteS3Object(ctx context.Context, t *testing.T, cli *s3.Client, bucket, key string) {
+	t.Helper()
+
+	if _, err := cli.DeleteObject(ctx, &s3.DeleteObjectInput{
+		Bucket: aws.String(bucket),
+		Key:    aws.String(key),
+	}); err != nil {
+		t.Fatalf("delete origin %s/%s: %v", bucket, key, err)
+	}
+}
+
+// SeedAzure uploads each blob to the named container as block blobs.
+func SeedAzure(ctx context.Context, t *testing.T, az *Azurite, ctr string, blobs []SeedBlob) {
+	t.Helper()
+
+	for _, b := range blobs {
+		az.UploadBlockBlob(ctx, t, ctr, b.Key, b.Data)
+	}
+}
+
+// deterministicBytes returns n bytes filled with a repeating pattern
+// derived from seed. Useful for byte-exact assertions without random
+// flakiness.
+func deterministicBytes(n int, seed byte) []byte {
+	out := make([]byte, n)
+	for i := range out {
+		out[i] = seed ^ byte(i*31+17)
+	}
+
+	return out
+}
diff --git a/internal/orca/manifests/doc.go b/internal/orca/manifests/doc.go
new file mode 100644
index 00000000..a629d147
--- /dev/null
+++ b/internal/orca/manifests/doc.go
@@ -0,0 +1,12 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Package manifests holds tests that validate the orca deployment
+// manifest templates render to syntactically correct, structurally
+// reasonable Kubernetes YAML.
+//
+// These tests catch typos, missing required fields, and template
+// regressions at compile time without needing a Kind cluster. They
+// complement (but do not replace) hack/orca's actual `kubectl apply`
+// validation.
+package manifests
diff --git a/internal/orca/manifests/manifests_test.go b/internal/orca/manifests/manifests_test.go
new file mode 100644
index 00000000..bbab6cab
--- /dev/null
+++ b/internal/orca/manifests/manifests_test.go
@@ -0,0 +1,307 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package manifests
+
+import (
+	"bytes"
+	"errors"
+	"io"
+	"os"
+	"path/filepath"
+	"runtime"
+	"sort"
+	"strings"
+	"testing"
+
+	"gopkg.in/yaml.v3"
+
+	"github.com/Azure/unbounded/hack/cmd/render-manifests/render"
+)
+
+// TestProductionManifestsRender renders every *.yaml.tmpl under
+// deploy/orca/ (excluding the dev/ subdirectory which contains the
+// in-Kind LocalStack/Azurite manifests) with realistic inputs and
+// asserts the output is structurally valid Kubernetes YAML.
+func TestProductionManifestsRender(t *testing.T) {
+	t.Parallel()
+
+	root := repoRoot(t)
+	templatesDir := filepath.Join(root, "deploy", "orca")
+
+	renderAndValidate(t, templatesDir, productionData(),
+		// One file at a time: walking the dev/ subdirectory is the dev
+		// suite's job, so we render-then-skip it here.
+		skipDir("dev"),
+		// Required kinds that MUST appear at least once across the
+		// rendered manifests.
+		expectKindsAtLeastOnce("Namespace", "Deployment", "Service", "ConfigMap"),
+	)
+}
+
+// TestDevManifestsRender renders the LocalStack + Azurite + init-Job
+// manifests used by the Kind dev harness.
+func TestDevManifestsRender(t *testing.T) {
+	t.Parallel()
+
+	root := repoRoot(t)
+	templatesDir := filepath.Join(root, "deploy", "orca", "dev")
+
+	renderAndValidate(t, templatesDir, devData(),
+		expectKindsAtLeastOnce("Deployment", "Service", "Job"),
+	)
+}
+
+// productionData supplies realistic template variables for the
+// production-shape templates. Templates use sprig's `default` for
+// missing keys; we set values that exercise the non-default paths
+// where it matters.
+func productionData() map[string]string {
+	return map[string]string{
+		"Namespace":               "orca-test",
+		"Image":                   "ghcr.io/example/orca:test",
+		"ImagePullPolicy":         "IfNotPresent",
+		"TargetReplicas":          "3",
+		"OriginID":                "test-origin",
+		"OriginDriver":            "awss3",
+		"OriginAWSS3Endpoint":     "http://localstack:4566",
+		"OriginAWSS3Region":       "us-east-1",
+		"OriginAWSS3Bucket":       "orca-origin",
+		"OriginAWSS3UsePathStyle": "true",
+		"CachestoreEndpoint":      "http://localstack:4566",
+		"CachestoreBucket":        "orca-cache",
+		"CachestoreRegion":        "us-east-1",
+		"ClusterService":          "orca-peers.orca-test.svc.cluster.local",
+		"ServerAuthEnabled":       "false",
+		"InternalTLSEnabled":      "false",
+		"AzureAccount":            "",
+		"AzureContainer":          "",
+		"AzureEndpoint":           "",
+	}
+}
+
+func devData() map[string]string {
+	return map[string]string{
+		"Namespace":        "orca-test",
+		"CachestoreBucket": "orca-cache",
+		"OriginBucket":     "orca-origin",
+		"AzuriteContainer": "orca-test",
+	}
+}
+
+// renderAndValidate renders every template under templatesDir into a
+// t.TempDir, then walks the output and applies each Validator.
+func renderAndValidate(t *testing.T, templatesDir string, data map[string]string, validators ...Validator) {
+	t.Helper()
+
+	outputDir := t.TempDir()
+
+	if err := render.Render(templatesDir, outputDir, data); err != nil {
+		t.Fatalf("render.Render: %v", err)
+	}
+	// Collect every rendered .yaml file. Skip directories filtered
+	// by the validators.
+	skipDirs := skipDirsOf(validators)
+
+	var renderedFiles []string
+
+	walkErr := filepath.WalkDir(outputDir, func(path string, d os.DirEntry, err error) error {
+		if err != nil {
+			return err
+		}
+
+		if d.IsDir() {
+			rel, _ := filepath.Rel(outputDir, path)
+			if _, skip := skipDirs[rel]; skip {
+				return filepath.SkipDir
+			}
+
+			return nil
+		}
+
+		if strings.HasSuffix(path, ".yaml") {
+			renderedFiles = append(renderedFiles, path)
+		}
+
+		return nil
+	})
+	if walkErr != nil {
+		t.Fatalf("walk rendered output: %v", walkErr)
+	}
+
+	if len(renderedFiles) == 0 {
+		t.Fatalf("no rendered manifests found under %s", outputDir)
+	}
+
+	sort.Strings(renderedFiles)
+
+	docs := parseRenderedDocs(t, renderedFiles)
+
+	// Always-on basic structural validation.
+	for _, d := range docs {
+		validateBasicStructure(t, d)
+	}
+
+	for _, v := range validators {
+		v.Validate(t, docs)
+	}
+}
+
+// renderedDoc is one logical YAML document plus the source file it
+// came from (multi-doc files split into multiple renderedDocs).
+type renderedDoc struct {
+	SourcePath string
+	Index      int
+	Doc        map[string]any
+}
+
+func parseRenderedDocs(t *testing.T, files []string) []renderedDoc {
+	t.Helper()
+
+	var docs []renderedDoc
+
+	for _, f := range files {
+		raw, err := os.ReadFile(f)
+		if err != nil {
+			t.Fatalf("read %s: %v", f, err)
+		}
+
+		dec := yaml.NewDecoder(bytes.NewReader(raw))
+
+		for i := 0; ; i++ {
+			var doc map[string]any
+			if derr := dec.Decode(&doc); derr != nil {
+				if errors.Is(derr, io.EOF) {
+					break
+				}
+
+				t.Fatalf("yaml decode %s doc %d: %v", f, i, derr)
+			}
+
+			if doc == nil {
+				continue
+			}
+
+			docs = append(docs, renderedDoc{SourcePath: f, Index: i, Doc: doc})
+		}
+	}
+
+	return docs
+}
+
+func validateBasicStructure(t *testing.T, d renderedDoc) {
+	t.Helper()
+
+	apiVersion, _ := d.Doc["apiVersion"].(string)
+	kind, _ := d.Doc["kind"].(string)
+
+	if apiVersion == "" {
+		t.Errorf("%s doc %d: missing apiVersion", d.SourcePath, d.Index)
+	}
+
+	if kind == "" {
+		t.Errorf("%s doc %d: missing kind", d.SourcePath, d.Index)
+	}
+
+	meta, _ := d.Doc["metadata"].(map[string]any)
+	if meta == nil {
+		t.Errorf("%s doc %d (kind=%s): missing metadata", d.SourcePath, d.Index, kind)
+		return
+	}
+
+	name, _ := meta["name"].(string)
+	if name == "" {
+		t.Errorf("%s doc %d (kind=%s): missing metadata.name", d.SourcePath, d.Index, kind)
+	}
+}
+
+// Validator is a test-time check applied to the full set of
+// rendered docs.
+type Validator interface {
+	Validate(t *testing.T, docs []renderedDoc)
+	skipDir() string // empty when not a dir filter
+}
+
+type kindsAtLeastOnce struct{ kinds []string }
+
+func (v kindsAtLeastOnce) Validate(t *testing.T, docs []renderedDoc) {
+	t.Helper()
+
+	seen := map[string]bool{}
+
+	for _, d := range docs {
+		if k, _ := d.Doc["kind"].(string); k != "" {
+			seen[k] = true
+		}
+	}
+
+	for _, want := range v.kinds {
+		if !seen[want] {
+			t.Errorf("expected at least one document of kind %q, got kinds %v", want, sortedKeys(seen))
+		}
+	}
+}
+
+func (v kindsAtLeastOnce) skipDir() string { return "" }
+
+func expectKindsAtLeastOnce(kinds ...string) Validator {
+	return kindsAtLeastOnce{kinds: kinds}
+}
+
+type dirSkipper struct{ name string }
+
+func (d dirSkipper) Validate(*testing.T, []renderedDoc) {}
+
+func (d dirSkipper) skipDir() string { return d.name }
+
+func skipDir(name string) Validator {
+	return dirSkipper{name: name}
+}
+
+func skipDirsOf(vs []Validator) map[string]struct{} {
+	out := map[string]struct{}{}
+
+	for _, v := range vs {
+		if d := v.skipDir(); d != "" {
+			out[d] = struct{}{}
+		}
+	}
+
+	return out
+}
+
+func sortedKeys(m map[string]bool) []string {
+	out := make([]string, 0, len(m))
+	for k := range m {
+		out = append(out, k)
+	}
+
+	sort.Strings(out)
+
+	return out
+}
+
+// repoRoot returns the absolute path to the repo root by walking up
+// from this test file's directory until it finds a go.mod.
+func repoRoot(t *testing.T) string {
+	t.Helper()
+
+	_, file, _, ok := runtime.Caller(0)
+	if !ok {
+		t.Fatal("runtime.Caller(0) failed")
+	}
+
+	dir := filepath.Dir(file)
+	for {
+		if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil {
+			return dir
+		}
+
+		parent := filepath.Dir(dir)
+		if parent == dir {
+			t.Fatalf("reached filesystem root without finding go.mod (started at %s)", filepath.Dir(file))
+		}
+
+		dir = parent
+	}
+}
diff --git a/internal/orca/metadata/metadata.go b/internal/orca/metadata/metadata.go
new file mode 100644
index 00000000..e122463c
--- /dev/null
+++ b/internal/orca/metadata/metadata.go
@@ -0,0 +1,331 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Package metadata is the per-replica object-metadata cache.
+//
+// Responsibilities:
+//   - bounded TTL'd cache of ObjectInfo keyed on (origin_id, bucket,
+//     key)
+//   - separate negative-TTL handling for 404 / unsupported-blob-type
+//     entries
+//   - per-replica HEAD singleflight so concurrent misses collapse to
+//     one Origin.Head
+package metadata
+
+import (
+	"container/list"
+	"context"
+	"encoding/binary"
+	"errors"
+	"log/slog"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/Azure/unbounded/internal/orca/config"
+	"github.com/Azure/unbounded/internal/orca/origin"
+)
+
+// Cache is the per-replica metadata cache.
+type Cache struct {
+	cfg config.Metadata
+	log *slog.Logger
+
+	mu  sync.Mutex
+	ll  *list.List
+	idx map[string]*list.Element
+
+	sf sync.Map // map[string]*sfEntry
+}
+
+type cacheEntry struct {
+	key       string
+	info      origin.ObjectInfo
+	negative  bool
+	negErr    error
+	expiresAt time.Time
+}
+
+type sfEntry struct {
+	once sync.Once
+	done chan struct{}
+	info origin.ObjectInfo
+	err  error
+}
+
+// NewCache builds a Cache from config. The log is used at debug
+// level for cache hit / miss / record / invalidate trace lines and
+// at warn level for unexpected backend errors caught during result
+// recording. Passing nil falls back to slog.Default().
+func NewCache(cfg config.Metadata, log *slog.Logger) *Cache {
+	if cfg.MaxEntries <= 0 {
+		cfg.MaxEntries = 10_000
+	}
+
+	if cfg.TTL <= 0 {
+		cfg.TTL = 5 * time.Minute
+	}
+
+	if cfg.NegativeTTL <= 0 {
+		cfg.NegativeTTL = 60 * time.Second
+	}
+
+	if log == nil {
+		log = slog.Default()
+	}
+
+	return &Cache{
+		cfg: cfg,
+		log: log,
+		ll:  list.New(),
+		idx: make(map[string]*list.Element, cfg.MaxEntries),
+	}
+}
+
+// lookup returns the cached ObjectInfo if present and unexpired.
+//
+// Returns:
+//   - info, true,  nil  -> positive cache hit
+//   - {}, true,    err  -> negative cache hit (err is the cached error)
+//   - {}, false,   nil  -> miss; caller should LookupOrFetch
+func (c *Cache) lookup(originID, bucket, key string) (origin.ObjectInfo, bool, error) {
+	k := mkKey(originID, bucket, key)
+
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	el, ok := c.idx[k]
+	if !ok {
+		return origin.ObjectInfo{}, false, nil
+	}
+
+	// The list is private; we control every value inserted (always
+	// *cacheEntry). The type assertion is safe.
+	e := el.Value.(*cacheEntry) //nolint:errcheck // type invariant: list elements are *cacheEntry
+
+	if time.Now().After(e.expiresAt) {
+		c.ll.Remove(el)
+		delete(c.idx, k)
+
+		return origin.ObjectInfo{}, false, nil
+	}
+
+	c.ll.MoveToFront(el)
+
+	if e.negative {
+		return origin.ObjectInfo{}, true, e.negErr
+	}
+
+	return e.info, true, nil
+}
+
+// LookupOrFetch returns the cached ObjectInfo on hit (positive or
+// negative); on miss, runs the per-replica HEAD singleflight against
+// fetch and caches the result with the appropriate TTL.
+//
+// Singleflight tradeoff: the first caller (leader) drives fetch with
+// its own ctx. If the leader's ctx is cancelled mid-fetch, joiners
+// observe the leader's resulting ctx-error rather than their own
+// (still-valid) ctx. This is the standard singleflight contract; a
+// joiner can re-issue after seeing ctx.Err on a closed sfe.done if
+// it wants to drive its own attempt.
+func (c *Cache) LookupOrFetch(
+	ctx context.Context,
+	originID, bucket, key string,
+	fetch func(ctx context.Context) (origin.ObjectInfo, error),
+) (origin.ObjectInfo, error) {
+	if info, ok, err := c.lookup(originID, bucket, key); ok {
+		hitKind := "positive"
+		if err != nil {
+			hitKind = "negative"
+		}
+
+		c.log.LogAttrs(ctx, slog.LevelDebug, "metadata_hit",
+			slog.String("origin_id", originID),
+			slog.String("bucket", bucket),
+			slog.String("key", key),
+			slog.String("kind", hitKind),
+		)
+
+		return info, err
+	}
+
+	k := mkKey(originID, bucket, key)
+	v, _ := c.sf.LoadOrStore(k, &sfEntry{done: make(chan struct{})})
+
+	// The sync.Map only ever holds *sfEntry; the type assertion is safe.
+	sfe := v.(*sfEntry) //nolint:errcheck // type invariant: sf map values are *sfEntry
+
+	first := false
+
+	sfe.once.Do(func() {
+		first = true
+	})
+
+	if first {
+		c.log.LogAttrs(ctx, slog.LevelDebug, "metadata_singleflight_leader",
+			slog.String("origin_id", originID),
+			slog.String("bucket", bucket),
+			slog.String("key", key),
+		)
+		// Delete the singleflight entry before closing done so a new
+		// caller arriving after Delete creates a fresh entry instead
+		// of silently replaying our (possibly transient-error) result.
+		// Existing joiners already loaded the old pointer and read the
+		// result via the closed done. The brief window between Delete
+		// and close where a new caller starts a concurrent fetch is
+		// benign: the new fetch either confirms or supersedes our
+		// result.
+		defer func() {
+			c.sf.Delete(k)
+			close(sfe.done)
+		}()
+
+		info, err := fetch(ctx)
+		sfe.info = info
+		sfe.err = err
+
+		c.recordResult(ctx, originID, bucket, key, info, err)
+
+		return info, err
+	}
+
+	c.log.LogAttrs(ctx, slog.LevelDebug, "metadata_singleflight_join",
+		slog.String("origin_id", originID),
+		slog.String("bucket", bucket),
+		slog.String("key", key),
+	)
+	// Joiner: wait for the leader.
+	select {
+	case <-ctx.Done():
+		return origin.ObjectInfo{}, ctx.Err()
+	case <-sfe.done:
+	}
+
+	return sfe.info, sfe.err
+}
+
+// Invalidate drops the entry.
+func (c *Cache) Invalidate(originID, bucket, key string) {
+	k := mkKey(originID, bucket, key)
+
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	if el, ok := c.idx[k]; ok {
+		c.ll.Remove(el)
+		delete(c.idx, k)
+		c.log.LogAttrs(context.Background(), slog.LevelDebug, "metadata_invalidate",
+			slog.String("origin_id", originID),
+			slog.String("bucket", bucket),
+			slog.String("key", key),
+		)
+	}
+}
+
+func (c *Cache) recordResult(ctx context.Context, originID, bucket, key string, info origin.ObjectInfo, err error) {
+	k := mkKey(originID, bucket, key)
+
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	now := time.Now()
+
+	var (
+		e        *cacheEntry
+		recorded string
+		ttl      time.Duration
+	)
+
+	switch {
+	case err == nil:
+		e = &cacheEntry{key: k, info: info, expiresAt: now.Add(c.cfg.TTL)}
+		recorded = "positive"
+		ttl = c.cfg.TTL
+	case errors.Is(err, origin.ErrNotFound):
+		e = &cacheEntry{key: k, negative: true, negErr: err, expiresAt: now.Add(c.cfg.NegativeTTL)}
+		recorded = "not_found"
+		ttl = c.cfg.NegativeTTL
+	default:
+		var (
+			ube *origin.UnsupportedBlobTypeError
+			mte *origin.MissingETagError
+		)
+
+		switch {
+		case errors.As(err, &ube):
+			e = &cacheEntry{key: k, negative: true, negErr: err, expiresAt: now.Add(c.cfg.NegativeTTL)}
+			recorded = "unsupported_blob_type"
+			ttl = c.cfg.NegativeTTL
+		case errors.As(err, &mte):
+			e = &cacheEntry{key: k, negative: true, negErr: err, expiresAt: now.Add(c.cfg.NegativeTTL)}
+			recorded = "missing_etag"
+			ttl = c.cfg.NegativeTTL
+		default:
+			c.log.LogAttrs(ctx, slog.LevelDebug, "metadata_record_skip_transient",
+				slog.String("origin_id", originID),
+				slog.String("bucket", bucket),
+				slog.String("key", key),
+				slog.Any("err", err),
+			)
+			// Other transient errors not cached.
+			return
+		}
+	}
+
+	if existing, ok := c.idx[k]; ok {
+		c.ll.Remove(existing)
+		delete(c.idx, k)
+	}
+
+	el := c.ll.PushFront(e)
+
+	c.idx[k] = el
+	for c.ll.Len() > c.cfg.MaxEntries {
+		oldest := c.ll.Back()
+		if oldest == nil {
+			break
+		}
+
+		c.ll.Remove(oldest)
+
+		oldEntry := oldest.Value.(*cacheEntry) //nolint:errcheck // type invariant: list elements are *cacheEntry
+		delete(c.idx, oldEntry.key)
+	}
+
+	c.log.LogAttrs(ctx, slog.LevelDebug, "metadata_record",
+		slog.String("origin_id", originID),
+		slog.String("bucket", bucket),
+		slog.String("key", key),
+		slog.String("kind", recorded),
+		slog.Duration("ttl", ttl),
+	)
+}
+
+// mkKey builds an in-memory cache key from (originID, bucket, key).
+// The encoding is length-prefixed: each field is written as an
+// 8-byte little-endian length followed by the field bytes. This
+// guarantees that two distinct triples cannot collide on the
+// rendered key. A naive 'origin|bucket|key' concatenation would
+// alias e.g. (origin="a|b", bucket="c", key="d") and
+// (origin="a", bucket="b|c", key="d") because S3 object keys may
+// legally contain '|'. The cache is purely in-memory so this
+// encoding has no on-disk compatibility implications.
+func mkKey(originID, bucket, key string) string {
+	var b strings.Builder
+
+	b.Grow(24 + len(originID) + len(bucket) + len(key))
+	writeLP(&b, originID)
+	writeLP(&b, bucket)
+	writeLP(&b, key)
+
+	return b.String()
+}
+
+func writeLP(b *strings.Builder, s string) {
+	var lenBuf [8]byte
+
+	binary.LittleEndian.PutUint64(lenBuf[:], uint64(len(s)))
+	b.Write(lenBuf[:])
+	b.WriteString(s)
+}
diff --git a/internal/orca/metadata/metadata_test.go b/internal/orca/metadata/metadata_test.go
new file mode 100644
index 00000000..81b25283
--- /dev/null
+++ b/internal/orca/metadata/metadata_test.go
@@ -0,0 +1,261 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package metadata
+
+import (
+	"bytes"
+	"context"
+	"errors"
+	"io"
+	"log/slog"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/Azure/unbounded/internal/orca/config"
+	"github.com/Azure/unbounded/internal/orca/origin"
+)
+
+// TestLookupOrFetch_TransientErrorNotReplayed verifies that after the
+// leader of a singleflight fetch returns a transient (non-cached)
+// error, a subsequent call to LookupOrFetch invokes fetch again
+// rather than silently replaying the cached error.
+//
+// Regression test for the defer-order race: with `close(done)` before
+// `Delete`, a second caller arriving in the gap would land on the
+// stale singleflight entry and skip fetch entirely.
+func TestLookupOrFetch_TransientErrorNotReplayed(t *testing.T) {
+	t.Parallel()
+
+	c := NewCache(config.Metadata{TTL: time.Minute, NegativeTTL: time.Minute, MaxEntries: 16}, nil)
+
+	var calls atomic.Int64
+
+	transientErr := errors.New("transient: try again")
+
+	fetch := func(_ context.Context) (origin.ObjectInfo, error) {
+		calls.Add(1)
+		return origin.ObjectInfo{}, transientErr
+	}
+
+	// Sequential calls: each must invoke fetch, never replay.
+	for i := 0; i < 5; i++ {
+		_, err := c.LookupOrFetch(t.Context(), "origin", "bucket", "key", fetch)
+		if !errors.Is(err, transientErr) {
+			t.Fatalf("call %d: err=%v want %v", i, err, transientErr)
+		}
+	}
+
+	if got := calls.Load(); got != 5 {
+		t.Errorf("fetch invoked %d times, want 5 (transient errors must not be cached)", got)
+	}
+}
+
+// TestLookupOrFetch_PositiveResultCached verifies positive results
+// are served from the cache without re-invoking fetch.
+func TestLookupOrFetch_PositiveResultCached(t *testing.T) {
+	t.Parallel()
+
+	c := NewCache(config.Metadata{TTL: time.Minute, NegativeTTL: time.Minute, MaxEntries: 16}, nil)
+
+	var calls atomic.Int64
+
+	want := origin.ObjectInfo{Size: 1234, ETag: "abc"}
+
+	fetch := func(_ context.Context) (origin.ObjectInfo, error) {
+		calls.Add(1)
+		return want, nil
+	}
+
+	for i := 0; i < 5; i++ {
+		got, err := c.LookupOrFetch(t.Context(), "origin", "bucket", "key", fetch)
+		if err != nil {
+			t.Fatalf("call %d: err=%v", i, err)
+		}
+
+		if got != want {
+			t.Errorf("call %d: got %+v want %+v", i, got, want)
+		}
+	}
+
+	if got := calls.Load(); got != 1 {
+		t.Errorf("fetch invoked %d times, want 1 (positive results must be cached)", got)
+	}
+}
+
+// TestLookupOrFetch_NotFoundCached verifies origin.ErrNotFound is
+// negatively cached and replayed without re-invoking fetch.
+func TestLookupOrFetch_NotFoundCached(t *testing.T) {
+	t.Parallel()
+
+	c := NewCache(config.Metadata{TTL: time.Minute, NegativeTTL: time.Minute, MaxEntries: 16}, nil)
+
+	var calls atomic.Int64
+
+	fetch := func(_ context.Context) (origin.ObjectInfo, error) {
+		calls.Add(1)
+		return origin.ObjectInfo{}, origin.ErrNotFound
+	}
+
+	for i := 0; i < 3; i++ {
+		_, err := c.LookupOrFetch(t.Context(), "origin", "bucket", "key", fetch)
+		if !errors.Is(err, origin.ErrNotFound) {
+			t.Fatalf("call %d: err=%v want ErrNotFound", i, err)
+		}
+	}
+
+	if got := calls.Load(); got != 1 {
+		t.Errorf("fetch invoked %d times, want 1 (ErrNotFound must be negatively cached)", got)
+	}
+}
+
+// TestLookupOrFetch_ConcurrentJoinersCollapse verifies that
+// simultaneous callers for the same key collapse to a single fetch.
+func TestLookupOrFetch_ConcurrentJoinersCollapse(t *testing.T) {
+	t.Parallel()
+
+	c := NewCache(config.Metadata{TTL: time.Minute, NegativeTTL: time.Minute, MaxEntries: 16}, nil)
+
+	var calls atomic.Int64
+
+	gate := make(chan struct{})
+	want := origin.ObjectInfo{Size: 42}
+
+	fetch := func(_ context.Context) (origin.ObjectInfo, error) {
+		calls.Add(1)
+		<-gate // pin the leader until joiners have arrived
+
+		return want, nil
+	}
+
+	const n = 8
+
+	var (
+		wg      sync.WaitGroup
+		results = make([]origin.ObjectInfo, n)
+		errs    = make([]error, n)
+	)
+
+	wg.Add(n)
+
+	for i := 0; i < n; i++ {
+		go func(i int) {
+			defer wg.Done()
+
+			results[i], errs[i] = c.LookupOrFetch(t.Context(), "origin", "bucket", "key", fetch)
+		}(i)
+	}
+
+	time.Sleep(50 * time.Millisecond) // let everyone arrive at the singleflight
+	close(gate)
+	wg.Wait()
+
+	if got := calls.Load(); got != 1 {
+		t.Errorf("fetch invoked %d times, want 1 (joiners must collapse)", got)
+	}
+
+	for i, err := range errs {
+		if err != nil {
+			t.Errorf("call %d: err=%v", i, err)
+		}
+
+		if results[i] != want {
+			t.Errorf("call %d: got %+v want %+v", i, results[i], want)
+		}
+	}
+}
+
+// TestMkKey_PipeCollisionResolved verifies that length-prefixed
+// encoding distinguishes (origin, bucket, key) triples that
+// previously aliased on the pipe-delimited concatenation.
+//
+// Under the old 'origin|bucket|key' shape, S3 object keys legally
+// containing '|' could produce key collisions across distinct
+// triples: ("a|b","c","d") and ("a","b|c","d") rendered to the
+// same string. The length-prefix encoding guarantees uniqueness.
+func TestMkKey_PipeCollisionResolved(t *testing.T) {
+	t.Parallel()
+
+	a := mkKey("a|b", "c", "d")
+	b := mkKey("a", "b|c", "d")
+
+	if a == b {
+		t.Errorf("pipe-delimited collision: mkKey(%q,%q,%q) == mkKey(%q,%q,%q) = %q",
+			"a|b", "c", "d", "a", "b|c", "d", a)
+	}
+}
+
+// TestNewCache_UsesInjectedLogger locks the contract that the
+// metadata cache uses the caller's logger rather than slog.Default.
+func TestNewCache_UsesInjectedLogger(t *testing.T) {
+	t.Parallel()
+
+	injected := slog.New(slog.NewTextHandler(io.Discard, nil))
+	c := NewCache(config.Metadata{TTL: time.Minute, NegativeTTL: time.Minute, MaxEntries: 16}, injected)
+
+	if c.log != injected {
+		t.Errorf("metadata.Cache.log not the injected logger")
+	}
+}
+
+// TestNewCache_NilLoggerFallsBackToDefault verifies the nil-logger
+// fallback so a misconfigured caller does not panic on the first
+// trace emission.
+func TestNewCache_NilLoggerFallsBackToDefault(t *testing.T) {
+	t.Parallel()
+
+	c := NewCache(config.Metadata{TTL: time.Minute, NegativeTTL: time.Minute, MaxEntries: 16}, nil)
+	if c.log == nil {
+		t.Errorf("nil logger should have fallen back to slog.Default()")
+	}
+}
+
+// TestLookupOrFetch_EmitsDebugTraces verifies that the metadata
+// cache emits the documented debug-level emissions on the leader,
+// joiner, hit, and record-result paths. The contract under test is
+// the named messages and the (origin_id, bucket, key) attribute
+// triple - operators rely on these for diagnosing cache-hit
+// patterns.
+func TestLookupOrFetch_EmitsDebugTraces(t *testing.T) {
+	t.Parallel()
+
+	var buf bytes.Buffer
+
+	log := slog.New(slog.NewTextHandler(&buf, &slog.HandlerOptions{Level: slog.LevelDebug}))
+	c := NewCache(config.Metadata{TTL: time.Minute, NegativeTTL: time.Minute, MaxEntries: 16}, log)
+
+	want := origin.ObjectInfo{Size: 42, ETag: "etag"}
+	// First call: leader path + positive record.
+	info, err := c.LookupOrFetch(context.Background(), "ox", "bkt", "obj",
+		func(_ context.Context) (origin.ObjectInfo, error) {
+			return want, nil
+		})
+	if err != nil || info.Size != 42 {
+		t.Fatalf("LookupOrFetch leader: info=%+v err=%v", info, err)
+	}
+	// Second call: cache hit path. The fetch function must not run.
+	_, err = c.LookupOrFetch(context.Background(), "ox", "bkt", "obj",
+		func(_ context.Context) (origin.ObjectInfo, error) {
+			t.Fatalf("fetch should not run on cache hit")
+			return origin.ObjectInfo{}, nil
+		})
+	if err != nil {
+		t.Fatalf("LookupOrFetch hit: %v", err)
+	}
+
+	out := buf.String()
+	for _, want := range []string{
+		"metadata_singleflight_leader",
+		"metadata_record",
+		"metadata_hit",
+		"bucket=bkt",
+		"key=obj",
+	} {
+		if !strings.Contains(out, want) {
+			t.Errorf("expected %q in debug output; got %q", want, out)
+		}
+	}
+}
diff --git a/internal/orca/origin/awss3/awss3.go b/internal/orca/origin/awss3/awss3.go
new file mode 100644
index 00000000..d803ced4
--- /dev/null
+++ b/internal/orca/origin/awss3/awss3.go
@@ -0,0 +1,378 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Package awss3 is the AWS S3 (and S3-compatible) origin driver. It
+// targets either real AWS S3 or a local S3-compatible endpoint such as
+// LocalStack. Useful as a credential-free origin for the dev harness:
+// LocalStack acts as both origin and cachestore (different buckets).
+//
+// This driver is read-only from Orca's perspective (Head, GetRange,
+// List). The seed step that uploads test objects to the origin bucket
+// happens out-of-band via aws-cli or similar.
+package awss3
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"log/slog"
+	"net/http"
+	"strings"
+
+	"github.com/aws/aws-sdk-go-v2/aws"
+	awshttp "github.com/aws/aws-sdk-go-v2/aws/transport/http"
+	awsconfig "github.com/aws/aws-sdk-go-v2/config"
+	"github.com/aws/aws-sdk-go-v2/credentials"
+	"github.com/aws/aws-sdk-go-v2/service/s3"
+	s3types "github.com/aws/aws-sdk-go-v2/service/s3/types"
+	"github.com/aws/smithy-go"
+
+	"github.com/Azure/unbounded/internal/orca/origin"
+)
+
+// Adapter implements origin.Origin against an S3-compatible endpoint.
+type Adapter struct {
+	cfg    Config
+	client *s3.Client
+	log    *slog.Logger
+}
+
+// Config is the awss3-driver configuration. Mirrors config.AWSS3 but
+// kept package-local so the driver can be unit-tested without
+// importing the whole config package.
+type Config struct {
+	// Endpoint, when set, overrides the regional default and routes
+	// requests at a custom URL (LocalStack uses
+	// http://localstack:4566). Leave empty for real AWS S3.
+	Endpoint string
+
+	// Region is the AWS region. LocalStack ignores this; the SDK
+	// requires a value.
+	Region string
+
+	// Bucket is the source bucket holding origin objects.
+	Bucket string
+
+	// AccessKey / SecretKey are static credentials. For LocalStack
+	// these are "test"/"test"; for real AWS, supply real creds.
+	AccessKey string
+	SecretKey string
+
+	// UsePathStyle: true for LocalStack (host-based addressing
+	// requires DNS wildcards LocalStack does not provide).
+	UsePathStyle bool
+}
+
+// New constructs an Adapter. The log receives debug-level
+// emissions for every Head / GetRange / List call and the error
+// mapping decision (not-found / auth / precondition) on failure
+// paths. Passing nil falls back to slog.Default().
+func New(ctx context.Context, cfg Config, log *slog.Logger) (*Adapter, error) {
+	if cfg.Bucket == "" {
+		return nil, fmt.Errorf("origin/awss3: bucket required")
+	}
+
+	if cfg.Region == "" {
+		cfg.Region = "us-east-1"
+	}
+
+	awsCfg, err := awsconfig.LoadDefaultConfig(ctx,
+		awsconfig.WithRegion(cfg.Region),
+		awsconfig.WithCredentialsProvider(credentials.NewStaticCredentialsProvider(
+			cfg.AccessKey, cfg.SecretKey, "",
+		)),
+		// Opt out of CRC64NVME default introduced in aws-sdk-go-v2
+		// 1.32. LocalStack 3.8 returns InvalidRequest for unknown
+		// algorithms; real AWS S3 still works either way.
+		awsconfig.WithRequestChecksumCalculation(aws.RequestChecksumCalculationWhenRequired),
+		awsconfig.WithResponseChecksumValidation(aws.ResponseChecksumValidationWhenRequired),
+	)
+	if err != nil {
+		return nil, fmt.Errorf("origin/awss3: aws config: %w", err)
+	}
+
+	client := s3.NewFromConfig(awsCfg, func(o *s3.Options) {
+		if cfg.Endpoint != "" {
+			o.BaseEndpoint = aws.String(cfg.Endpoint)
+		}
+
+		o.UsePathStyle = cfg.UsePathStyle
+	})
+
+	if log == nil {
+		log = slog.Default()
+	}
+
+	return &Adapter{cfg: cfg, client: client, log: log}, nil
+}
+
+// Head returns ObjectInfo for the named object. The bucket arg lets
+// callers override the configured bucket; if empty, the configured
+// bucket is used.
+func (a *Adapter) Head(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) {
+	b := bucket
+	if b == "" {
+		b = a.cfg.Bucket
+	}
+
+	a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_head_request",
+		slog.String("bucket", b),
+		slog.String("key", key),
+	)
+
+	out, err := a.client.HeadObject(ctx, &s3.HeadObjectInput{
+		Bucket: aws.String(b),
+		Key:    aws.String(key),
+	})
+	if err != nil {
+		if isNotFound(err) {
+			a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_head_not_found",
+				slog.String("bucket", b),
+				slog.String("key", key),
+			)
+
+			return origin.ObjectInfo{LastStatus: http.StatusNotFound}, origin.ErrNotFound
+		}
+
+		if isAuth(err) {
+			a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_head_auth",
+				slog.String("bucket", b),
+				slog.String("key", key),
+			)
+
+			return origin.ObjectInfo{}, origin.ErrAuth
+		}
+
+		return origin.ObjectInfo{}, fmt.Errorf("awss3 head: %w", err)
+	}
+
+	info := origin.ObjectInfo{LastStatus: http.StatusOK}
+	if out.ContentLength != nil {
+		info.Size = *out.ContentLength
+	}
+
+	if out.ETag != nil {
+		info.ETag = strings.Trim(*out.ETag, "\"")
+	}
+
+	if out.ContentType != nil {
+		info.ContentType = *out.ContentType
+	}
+
+	if out.LastModified != nil {
+		info.LastValidated = *out.LastModified
+	}
+
+	a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_head_response",
+		slog.String("bucket", b),
+		slog.String("key", key),
+		slog.Int64("size", info.Size),
+		slog.String("etag", origin.ETagShort(info.ETag)),
+	)
+
+	return info, nil
+}
+
+// GetRange fetches [off, off+n) of the object, sending If-Match: <etag>.
+func (a *Adapter) GetRange(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error) {
+	b := bucket
+	if b == "" {
+		b = a.cfg.Bucket
+	}
+
+	rng := fmt.Sprintf("bytes=%d-%d", off, off+n-1)
+
+	in := &s3.GetObjectInput{
+		Bucket: aws.String(b),
+		Key:    aws.String(key),
+		Range:  aws.String(rng),
+	}
+	if etag != "" {
+		// S3 expects the etag wrapped in double quotes.
+		in.IfMatch = aws.String("\"" + etag + "\"")
+	}
+
+	a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_get_range_request",
+		slog.String("bucket", b),
+		slog.String("key", key),
+		slog.String("etag", origin.ETagShort(etag)),
+		slog.Int64("off", off),
+		slog.Int64("n", n),
+	)
+
+	out, err := a.client.GetObject(ctx, in)
+	if err != nil {
+		if isPreconditionFailed(err) {
+			a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_get_range_etag_changed",
+				slog.String("bucket", b),
+				slog.String("key", key),
+				slog.String("want_etag", origin.ETagShort(etag)),
+			)
+
+			return nil, &origin.OriginETagChangedError{
+				Bucket: b, Key: key, Want: etag,
+			}
+		}
+
+		if isNotFound(err) {
+			a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_get_range_not_found",
+				slog.String("bucket", b),
+				slog.String("key", key),
+			)
+
+			return nil, origin.ErrNotFound
+		}
+
+		if isAuth(err) {
+			a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_get_range_auth",
+				slog.String("bucket", b),
+				slog.String("key", key),
+			)
+
+			return nil, origin.ErrAuth
+		}
+
+		return nil, fmt.Errorf("awss3 get-range: %w", err)
+	}
+
+	a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_get_range_response",
+		slog.String("bucket", b),
+		slog.String("key", key),
+	)
+
+	return out.Body, nil
+}
+
+// List enumerates objects under prefix.
+func (a *Adapter) List(ctx context.Context, bucket, prefix, marker string, maxResults int) (origin.ListResult, error) {
+	b := bucket
+	if b == "" {
+		b = a.cfg.Bucket
+	}
+
+	a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_list_request",
+		slog.String("bucket", b),
+		slog.String("prefix", prefix),
+		slog.String("marker", marker),
+		slog.Int("max", maxResults),
+	)
+
+	in := &s3.ListObjectsV2Input{
+		Bucket:  aws.String(b),
+		Prefix:  aws.String(prefix),
+		MaxKeys: aws.Int32(int32(maxResults)),
+	}
+	if marker != "" {
+		in.ContinuationToken = aws.String(marker)
+	}
+
+	out, err := a.client.ListObjectsV2(ctx, in)
+	if err != nil {
+		if isAuth(err) {
+			a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_list_auth",
+				slog.String("bucket", b),
+			)
+
+			return origin.ListResult{}, origin.ErrAuth
+		}
+
+		return origin.ListResult{}, fmt.Errorf("awss3 list: %w", err)
+	}
+
+	res := origin.ListResult{}
+
+	for _, item := range out.Contents {
+		entry := origin.ObjectEntry{}
+		if item.Key != nil {
+			entry.Key = *item.Key
+		}
+
+		if item.Size != nil {
+			entry.Size = *item.Size
+		}
+
+		if item.ETag != nil {
+			entry.ETag = strings.Trim(*item.ETag, "\"")
+		}
+
+		res.Entries = append(res.Entries, entry)
+	}
+
+	if out.IsTruncated != nil {
+		res.IsTruncated = *out.IsTruncated
+	}
+
+	if out.NextContinuationToken != nil {
+		res.NextMarker = *out.NextContinuationToken
+	}
+
+	a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_list_response",
+		slog.String("bucket", b),
+		slog.Int("count", len(res.Entries)),
+		slog.Bool("truncated", res.IsTruncated),
+	)
+
+	return res, nil
+}
+
+func isNotFound(err error) bool {
+	var nsk *s3types.NoSuchKey
+	if errors.As(err, &nsk) {
+		return true
+	}
+
+	var nsb *s3types.NoSuchBucket
+	if errors.As(err, &nsb) {
+		return true
+	}
+
+	var notFound *s3types.NotFound
+	if errors.As(err, &notFound) {
+		return true
+	}
+
+	var respErr *awshttp.ResponseError
+	if errors.As(err, &respErr) && respErr.Response != nil &&
+		respErr.Response.StatusCode == http.StatusNotFound {
+		return true
+	}
+
+	return false
+}
+
+func isAuth(err error) bool {
+	var apiErr smithy.APIError
+	if errors.As(err, &apiErr) {
+		switch apiErr.ErrorCode() {
+		case "AccessDenied", "Unauthorized", "Forbidden", "InvalidAccessKeyId", "SignatureDoesNotMatch":
+			return true
+		}
+	}
+
+	var respErr *awshttp.ResponseError
+	if errors.As(err, &respErr) && respErr.Response != nil {
+		status := respErr.Response.StatusCode
+		if status == http.StatusUnauthorized || status == http.StatusForbidden {
+			return true
+		}
+	}
+
+	return false
+}
+
+// isPreconditionFailed reports whether err carries an HTTP 412
+// Precondition Failed response. Used to translate
+// If-Match-rejected GetRange calls into the orca-internal
+// OriginETagChangedError. We rely on the HTTP status code on the
+// underlying *awshttp.ResponseError rather than service error
+// codes; the status code is part of the stable wire contract
+// across SDK and backend versions.
+func isPreconditionFailed(err error) bool {
+	var respErr *awshttp.ResponseError
+	if errors.As(err, &respErr) && respErr.Response != nil {
+		return respErr.Response.StatusCode == http.StatusPreconditionFailed
+	}
+
+	return false
+}
diff --git a/internal/orca/origin/awss3/awss3_test.go b/internal/orca/origin/awss3/awss3_test.go
new file mode 100644
index 00000000..ac8fd11f
--- /dev/null
+++ b/internal/orca/origin/awss3/awss3_test.go
@@ -0,0 +1,125 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package awss3
+
+import (
+	"errors"
+	"net/http"
+	"testing"
+
+	awshttp "github.com/aws/aws-sdk-go-v2/aws/transport/http"
+	s3types "github.com/aws/aws-sdk-go-v2/service/s3/types"
+	smithy "github.com/aws/smithy-go"
+	smithyhttp "github.com/aws/smithy-go/transport/http"
+)
+
+// makeResponseErr builds an *awshttp.ResponseError wrapping the
+// given HTTP status code. Mirrors how the AWS SDK surfaces service
+// errors to callers.
+func makeResponseErr(status int, inner error) *awshttp.ResponseError {
+	return &awshttp.ResponseError{
+		ResponseError: &smithyhttp.ResponseError{
+			Response: &smithyhttp.Response{
+				Response: &http.Response{StatusCode: status},
+			},
+			Err: inner,
+		},
+	}
+}
+
+// fakeAPIError implements smithy.APIError for testing service-code
+// matching paths (AccessDenied / typed-not-found etc).
+type fakeAPIError struct{ code string }
+
+func (e *fakeAPIError) Error() string                 { return e.code }
+func (e *fakeAPIError) ErrorCode() string             { return e.code }
+func (e *fakeAPIError) ErrorMessage() string          { return e.code }
+func (e *fakeAPIError) ErrorFault() smithy.ErrorFault { return smithy.FaultUnknown }
+func (e *fakeAPIError) HTTPStatusCode() int           { return 0 }
+
+// TestIsPreconditionFailed_FromHTTPStatus verifies that only an HTTP
+// 412 response satisfies the predicate. The previous implementation
+// matched service codes 'PreconditionFailed' and
+// 'ConditionalRequestConflict' plus a substring fallback on
+// err.Error(), which was both incomplete (didn't cover backends
+// returning only the status) and fragile (false positives on
+// arbitrary error messages containing '412').
+func TestIsPreconditionFailed_FromHTTPStatus(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name string
+		err  error
+		want bool
+	}{
+		{"412 ResponseError -> true", makeResponseErr(412, errors.New("precondition")), true},
+		{"500 ResponseError -> false", makeResponseErr(500, errors.New("ise")), false},
+		{"404 ResponseError -> false", makeResponseErr(404, errors.New("not found")), false},
+		{"plain error -> false", errors.New("StatusCode: 412 something"), false},
+		{"nil -> false", nil, false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := isPreconditionFailed(tt.err); got != tt.want {
+				t.Errorf("isPreconditionFailed = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
+
+// TestIsNotFound covers the typed-error and HTTP-status branches.
+func TestIsNotFound(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name string
+		err  error
+		want bool
+	}{
+		{"NoSuchKey typed", &s3types.NoSuchKey{}, true},
+		{"NoSuchBucket typed", &s3types.NoSuchBucket{}, true},
+		{"NotFound typed", &s3types.NotFound{}, true},
+		{"404 ResponseError", makeResponseErr(404, errors.New("nf")), true},
+		{"500 ResponseError", makeResponseErr(500, errors.New("ise")), false},
+		{"plain error", errors.New("random"), false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := isNotFound(tt.err); got != tt.want {
+				t.Errorf("isNotFound = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
+
+// TestIsAuth covers both the typed APIError branch and the HTTP
+// 401/403 status branch.
+func TestIsAuth(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name string
+		err  error
+		want bool
+	}{
+		{"AccessDenied APIError", &fakeAPIError{code: "AccessDenied"}, true},
+		{"InvalidAccessKeyId APIError", &fakeAPIError{code: "InvalidAccessKeyId"}, true},
+		{"SignatureDoesNotMatch APIError", &fakeAPIError{code: "SignatureDoesNotMatch"}, true},
+		{"403 ResponseError", makeResponseErr(403, errors.New("denied")), true},
+		{"401 ResponseError", makeResponseErr(401, errors.New("unauth")), true},
+		{"404 ResponseError", makeResponseErr(404, errors.New("nf")), false},
+		{"500 ResponseError", makeResponseErr(500, errors.New("ise")), false},
+		{"plain error", errors.New("auth?"), false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := isAuth(tt.err); got != tt.want {
+				t.Errorf("isAuth = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
diff --git a/internal/orca/origin/azureblob/azureblob.go b/internal/orca/origin/azureblob/azureblob.go
new file mode 100644
index 00000000..89406ed9
--- /dev/null
+++ b/internal/orca/origin/azureblob/azureblob.go
@@ -0,0 +1,369 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Package azureblob is the Azure Blob Storage adapter for the Origin
+// interface. Block Blobs only; PageBlob and AppendBlob are rejected
+// at Head() with UnsupportedBlobTypeError.
+package azureblob
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"log/slog"
+	"net/http"
+	"strings"
+
+	"github.com/Azure/azure-sdk-for-go/sdk/azcore"
+	"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
+	"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob"
+	"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blob"
+	"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/bloberror"
+	"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/container"
+
+	"github.com/Azure/unbounded/internal/orca/config"
+	"github.com/Azure/unbounded/internal/orca/origin"
+)
+
+// Adapter implements origin.Origin against Azure Blob Storage.
+type Adapter struct {
+	cfg    config.Azureblob
+	client *azblob.Client
+	log    *slog.Logger
+}
+
+// New builds an Adapter from config. The log receives debug-level
+// emissions for every Head / GetRange / List call and the error
+// mapping decision (not-found / auth / precondition / unsupported
+// blob type) on failure paths. Passing nil falls back to
+// slog.Default().
+func New(cfg config.Azureblob, log *slog.Logger) (*Adapter, error) {
+	if cfg.Account == "" {
+		return nil, fmt.Errorf("azureblob: account required")
+	}
+
+	if cfg.AccountKey == "" {
+		return nil, fmt.Errorf("azureblob: account_key required")
+	}
+
+	cred, err := azblob.NewSharedKeyCredential(cfg.Account, cfg.AccountKey)
+	if err != nil {
+		return nil, fmt.Errorf("azureblob: shared-key credential: %w", err)
+	}
+
+	endpoint := cfg.Endpoint
+	if endpoint == "" {
+		endpoint = fmt.Sprintf("https://%s.blob.core.windows.net/", cfg.Account)
+	}
+
+	client, err := azblob.NewClientWithSharedKeyCredential(endpoint, cred, nil)
+	if err != nil {
+		return nil, fmt.Errorf("azureblob: client: %w", err)
+	}
+
+	if log == nil {
+		log = slog.Default()
+	}
+
+	return &Adapter{cfg: cfg, client: client, log: log}, nil
+}
+
+// Head returns ObjectInfo for the named blob.
+//
+// "bucket" maps to the configured container; the bucket arg is honored
+// only if non-empty (allowing single-container deployments to use the
+// configured container as the default).
+func (a *Adapter) Head(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) {
+	cName := bucket
+	if cName == "" {
+		cName = a.cfg.Container
+	}
+
+	a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_head_request",
+		slog.String("container", cName),
+		slog.String("key", key),
+	)
+
+	props, err := a.client.ServiceClient().NewContainerClient(cName).
+		NewBlobClient(key).GetProperties(ctx, nil)
+	if err != nil {
+		if isNotFound(err) {
+			a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_head_not_found",
+				slog.String("container", cName),
+				slog.String("key", key),
+			)
+
+			return origin.ObjectInfo{LastStatus: http.StatusNotFound}, origin.ErrNotFound
+		}
+
+		if isAuth(err) {
+			a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_head_auth",
+				slog.String("container", cName),
+				slog.String("key", key),
+			)
+
+			return origin.ObjectInfo{}, origin.ErrAuth
+		}
+
+		return origin.ObjectInfo{}, fmt.Errorf("azureblob head: %w", err)
+	}
+
+	if err := validateBlobType(cName, key, props.BlobType); err != nil {
+		a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_head_unsupported_blob_type",
+			slog.String("container", cName),
+			slog.String("key", key),
+		)
+
+		return origin.ObjectInfo{}, err
+	}
+
+	info := origin.ObjectInfo{LastStatus: http.StatusOK}
+	if props.ContentLength != nil {
+		info.Size = *props.ContentLength
+	}
+
+	if props.ETag != nil {
+		info.ETag = unwrapAzcoreETag(props.ETag)
+	}
+
+	if props.ContentType != nil {
+		info.ContentType = *props.ContentType
+	}
+
+	if props.LastModified != nil {
+		info.LastValidated = *props.LastModified
+	}
+
+	a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_head_response",
+		slog.String("container", cName),
+		slog.String("key", key),
+		slog.Int64("size", info.Size),
+		slog.String("etag", origin.ETagShort(info.ETag)),
+	)
+
+	return info, nil
+}
+
+// GetRange fetches [off, off+n) of the blob, sending If-Match: <etag>.
+func (a *Adapter) GetRange(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error) {
+	cName := bucket
+	if cName == "" {
+		cName = a.cfg.Container
+	}
+
+	bc := a.client.ServiceClient().NewContainerClient(cName).NewBlobClient(key)
+	opts := &azblob.DownloadStreamOptions{
+		Range: blob.HTTPRange{Offset: off, Count: n},
+	}
+
+	if etag != "" {
+		// Azure (like S3) expects the entity-tag value in If-Match
+		// to be a quoted-string per RFC 7232. We strip the quotes
+		// on Head (a.cfg internal representation is unquoted) so
+		// re-wrap here at the point of egress, mirroring the
+		// awss3 driver.
+		etagVal := azcore.ETag("\"" + etag + "\"")
+		opts.AccessConditions = &blob.AccessConditions{
+			ModifiedAccessConditions: &blob.ModifiedAccessConditions{
+				IfMatch: to.Ptr(etagVal),
+			},
+		}
+	}
+
+	a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_get_range_request",
+		slog.String("container", cName),
+		slog.String("key", key),
+		slog.String("etag", origin.ETagShort(etag)),
+		slog.Int64("off", off),
+		slog.Int64("n", n),
+	)
+
+	resp, err := bc.DownloadStream(ctx, opts)
+	if err != nil {
+		if isPreconditionFailed(err) {
+			a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_get_range_etag_changed",
+				slog.String("container", cName),
+				slog.String("key", key),
+				slog.String("want_etag", origin.ETagShort(etag)),
+			)
+
+			return nil, &origin.OriginETagChangedError{
+				Bucket: cName, Key: key, Want: etag,
+			}
+		}
+
+		if isNotFound(err) {
+			a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_get_range_not_found",
+				slog.String("container", cName),
+				slog.String("key", key),
+			)
+
+			return nil, origin.ErrNotFound
+		}
+
+		if isAuth(err) {
+			a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_get_range_auth",
+				slog.String("container", cName),
+				slog.String("key", key),
+			)
+
+			return nil, origin.ErrAuth
+		}
+
+		return nil, fmt.Errorf("azureblob get-range: %w", err)
+	}
+
+	a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_get_range_response",
+		slog.String("container", cName),
+		slog.String("key", key),
+	)
+
+	return resp.Body, nil
+}
+
+// List enumerates blobs in the container matching prefix.
+func (a *Adapter) List(ctx context.Context, bucket, prefix, marker string, maxResults int) (origin.ListResult, error) {
+	cName := bucket
+	if cName == "" {
+		cName = a.cfg.Container
+	}
+
+	a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_list_request",
+		slog.String("container", cName),
+		slog.String("prefix", prefix),
+		slog.String("marker", marker),
+		slog.Int("max", maxResults),
+	)
+
+	cc := a.client.ServiceClient().NewContainerClient(cName)
+	max := int32(maxResults)
+	pager := cc.NewListBlobsFlatPager(&container.ListBlobsFlatOptions{
+		Prefix:     &prefix,
+		MaxResults: &max,
+		Marker:     stringOrNil(marker),
+	})
+	out := origin.ListResult{}
+
+	if pager.More() {
+		page, err := pager.NextPage(ctx)
+		if err != nil {
+			if isAuth(err) {
+				a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_list_auth",
+					slog.String("container", cName),
+				)
+
+				return origin.ListResult{}, origin.ErrAuth
+			}
+
+			return origin.ListResult{}, fmt.Errorf("azureblob list: %w", err)
+		}
+
+		for _, item := range page.Segment.BlobItems {
+			entry := origin.ObjectEntry{}
+			if item.Name != nil {
+				entry.Key = *item.Name
+			}
+
+			if item.Properties != nil {
+				if item.Properties.ContentLength != nil {
+					entry.Size = *item.Properties.ContentLength
+				}
+
+				if item.Properties.ETag != nil {
+					entry.ETag = unwrapAzcoreETag(item.Properties.ETag)
+				}
+
+				if item.Properties.BlobType != nil {
+					entry.BlobType = string(*item.Properties.BlobType)
+				}
+			}
+
+			out.Entries = append(out.Entries, entry)
+		}
+
+		if page.NextMarker != nil {
+			out.NextMarker = *page.NextMarker
+			out.IsTruncated = *page.NextMarker != ""
+		}
+	}
+
+	a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_list_response",
+		slog.String("container", cName),
+		slog.Int("count", len(out.Entries)),
+		slog.Bool("truncated", out.IsTruncated),
+	)
+
+	return out, nil
+}
+
+func stringOrNil(s string) *string {
+	if s == "" {
+		return nil
+	}
+
+	return &s
+}
+
+func isNotFound(err error) bool {
+	return bloberror.HasCode(err, bloberror.BlobNotFound) ||
+		bloberror.HasCode(err, bloberror.ContainerNotFound) ||
+		errors.Is(err, origin.ErrNotFound)
+}
+
+func isAuth(err error) bool {
+	var rerr *azcore.ResponseError
+	if errors.As(err, &rerr) {
+		if rerr.StatusCode == http.StatusUnauthorized || rerr.StatusCode == http.StatusForbidden {
+			return true
+		}
+	}
+
+	return bloberror.HasCode(err, bloberror.AuthenticationFailed) ||
+		bloberror.HasCode(err, bloberror.AuthorizationFailure)
+}
+
+func isPreconditionFailed(err error) bool {
+	var rerr *azcore.ResponseError
+	if errors.As(err, &rerr) && rerr.StatusCode == http.StatusPreconditionFailed {
+		return true
+	}
+
+	return bloberror.HasCode(err, bloberror.ConditionNotMet)
+}
+
+// validateBlobType returns an UnsupportedBlobTypeError for any
+// non-Block-Blob type (Page or Append). PageBlob and AppendBlob's
+// random-access-mutation model is incompatible with orca's chunked
+// immutable cache contract, so they are unconditionally rejected
+// here. Extracted as a pure function so unit tests can cover the
+// branches without an Azurite round-trip.
+func validateBlobType(container, key string, blobType *blob.BlobType) error {
+	if blobType == nil {
+		return nil
+	}
+
+	if *blobType == blob.BlobTypeBlockBlob {
+		return nil
+	}
+
+	return &origin.UnsupportedBlobTypeError{
+		Bucket:   container,
+		Key:      key,
+		BlobType: string(*blobType),
+	}
+}
+
+// unwrapAzcoreETag normalises an *azcore.ETag from the Azure SDK
+// to the unquoted form orca uses internally. The Azure REST API
+// returns entity tags as quoted-strings per RFC 7232; the SDK
+// preserves the quotes, and orca strips them at the boundary so
+// later If-Match egress (which re-wraps via the awss3 / azureblob
+// drivers) doesn't double-quote.
+func unwrapAzcoreETag(e *azcore.ETag) string {
+	if e == nil {
+		return ""
+	}
+
+	return strings.Trim(string(*e), "\"")
+}
diff --git a/internal/orca/origin/azureblob/azureblob_test.go b/internal/orca/origin/azureblob/azureblob_test.go
new file mode 100644
index 00000000..20e5fccf
--- /dev/null
+++ b/internal/orca/origin/azureblob/azureblob_test.go
@@ -0,0 +1,201 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package azureblob
+
+import (
+	"context"
+	"encoding/base64"
+	"errors"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"sync/atomic"
+	"testing"
+
+	"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blob"
+
+	"github.com/Azure/unbounded/internal/orca/config"
+	"github.com/Azure/unbounded/internal/orca/origin"
+)
+
+// TestValidateBlobType covers every branch of the unconditional
+// block-blob-only enforcement. PageBlob and AppendBlob are always
+// rejected; BlockBlob and the nil/unknown response shape pass.
+func TestValidateBlobType(t *testing.T) {
+	pageBlob := blob.BlobTypePageBlob
+	appendBlob := blob.BlobTypeAppendBlob
+	blockBlob := blob.BlobTypeBlockBlob
+
+	tests := []struct {
+		name            string
+		blobType        *blob.BlobType
+		wantUnsupported bool
+	}{
+		{"nil blob type passes (no info to validate)", nil, false},
+		{"block blob accepted", &blockBlob, false},
+		{"page blob refused", &pageBlob, true},
+		{"append blob refused", &appendBlob, true},
+	}
+
+	const (
+		container = "ctr"
+		key       = "key"
+	)
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			err := validateBlobType(container, key, tt.blobType)
+
+			if (err != nil) != tt.wantUnsupported {
+				t.Fatalf("err=%v, wantUnsupported=%v", err, tt.wantUnsupported)
+			}
+
+			if !tt.wantUnsupported {
+				return
+			}
+
+			var ube *origin.UnsupportedBlobTypeError
+			if !errors.As(err, &ube) {
+				t.Fatalf("err type=%T (want *origin.UnsupportedBlobTypeError): %v", err, err)
+			}
+
+			if ube.Bucket != container {
+				t.Errorf("Bucket=%q want %q", ube.Bucket, container)
+			}
+
+			if ube.Key != key {
+				t.Errorf("Key=%q want %q", ube.Key, key)
+			}
+
+			if tt.blobType != nil && ube.BlobType != string(*tt.blobType) {
+				t.Errorf("BlobType=%q want %q", ube.BlobType, string(*tt.blobType))
+			}
+		})
+	}
+}
+
+// TestValidateBlobType_NonBlockBlob_AlwaysRejected is the regression
+// test for the fix that removed the user-overridable
+// EnforceBlockBlobOnly flag. There is no longer any code path that
+// accepts a Page or Append blob.
+func TestValidateBlobType_NonBlockBlob_AlwaysRejected(t *testing.T) {
+	pageBlob := blob.BlobTypePageBlob
+
+	if err := validateBlobType("ctr", "key", &pageBlob); err == nil {
+		t.Fatalf("page blob accepted; want UnsupportedBlobTypeError")
+	}
+
+	appendBlob := blob.BlobTypeAppendBlob
+	if err := validateBlobType("ctr", "key", &appendBlob); err == nil {
+		t.Fatalf("append blob accepted; want UnsupportedBlobTypeError")
+	}
+}
+
+// TestGetRange_QuotesIfMatchHeader verifies that the If-Match header
+// emitted on a conditional GetRange is the etag value wrapped in
+// double quotes per RFC 7232. The internal representation strips
+// quotes on Head (drivers normalise to unquoted), so this is the
+// re-wrap point on egress. Without the wrap an upstream that
+// strictly enforces RFC 7232 entity-tag syntax would reject the
+// precondition or treat it as never-matched.
+func TestGetRange_QuotesIfMatchHeader(t *testing.T) {
+	t.Parallel()
+
+	const etagUnquoted = "0x8DDCAFE00000000"
+
+	var captured atomic.Value // string
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		captured.Store(r.Header.Get("If-Match"))
+		// Respond with the requested bytes. The exact body is not
+		// validated by this test - only the inbound If-Match header
+		// is. A small synthetic body keeps the SDK happy.
+		w.Header().Set("Content-Length", "4")
+		w.Header().Set("Content-Type", "application/octet-stream")
+		w.Header().Set("ETag", "\""+etagUnquoted+"\"")
+		w.WriteHeader(http.StatusPartialContent)
+		_, _ = w.Write([]byte("test")) //nolint:errcheck // best-effort test write
+	}))
+
+	t.Cleanup(srv.Close)
+	// Azurite uses the account name as the URL path component. We
+	// mirror that shape so the SDK signs/issues requests in the
+	// expected layout.
+	cfg := config.Azureblob{
+		Account:    "devstoreaccount1",
+		AccountKey: base64.StdEncoding.EncodeToString([]byte("test-shared-key-placeholder--32b")),
+		Container:  "ctr",
+		Endpoint:   srv.URL + "/devstoreaccount1",
+	}
+
+	a, err := New(cfg, nil)
+	if err != nil {
+		t.Fatalf("azureblob.New: %v", err)
+	}
+
+	body, err := a.GetRange(context.Background(), "ctr", "key", etagUnquoted, 0, 4)
+	if err != nil {
+		t.Fatalf("GetRange: %v", err)
+	}
+
+	defer body.Close() //nolint:errcheck // test cleanup
+
+	if _, err := io.ReadAll(body); err != nil {
+		t.Fatalf("read body: %v", err)
+	}
+
+	got, _ := captured.Load().(string)
+
+	want := "\"" + etagUnquoted + "\""
+	if got != want {
+		t.Errorf("If-Match=%q want %q", got, want)
+	}
+}
+
+// TestGetRange_OmitsIfMatchWhenEtagEmpty verifies that the If-Match
+// header is not sent at all when the caller supplies an empty etag.
+// Sending an empty If-Match would either be a malformed precondition
+// or evaluate as never-matching depending on server interpretation.
+func TestGetRange_OmitsIfMatchWhenEtagEmpty(t *testing.T) {
+	t.Parallel()
+
+	var captured atomic.Value // string
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// Record presence/absence; empty string here means "header
+		// was absent".
+		captured.Store(r.Header.Get("If-Match"))
+		w.Header().Set("Content-Length", "4")
+		w.WriteHeader(http.StatusPartialContent)
+		_, _ = w.Write([]byte("test")) //nolint:errcheck // best-effort test write
+	}))
+
+	t.Cleanup(srv.Close)
+
+	cfg := config.Azureblob{
+		Account:    "devstoreaccount1",
+		AccountKey: base64.StdEncoding.EncodeToString([]byte("test-shared-key-placeholder--32b")),
+		Container:  "ctr",
+		Endpoint:   srv.URL + "/devstoreaccount1",
+	}
+
+	a, err := New(cfg, nil)
+	if err != nil {
+		t.Fatalf("azureblob.New: %v", err)
+	}
+
+	body, err := a.GetRange(context.Background(), "ctr", "key", "", 0, 4)
+	if err != nil {
+		t.Fatalf("GetRange: %v", err)
+	}
+
+	defer body.Close() //nolint:errcheck // test cleanup
+
+	_, _ = io.ReadAll(body) //nolint:errcheck // test cleanup
+
+	got, _ := captured.Load().(string)
+	if got != "" {
+		t.Errorf("If-Match present (%q) when etag was empty; want absent", got)
+	}
+}
diff --git a/internal/orca/origin/origin.go b/internal/orca/origin/origin.go
new file mode 100644
index 00000000..326c8884
--- /dev/null
+++ b/internal/orca/origin/origin.go
@@ -0,0 +1,133 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Package origin defines the upstream-blob-store interface and shared
+// types. Concrete adapters live under origin/<driver>/.
+package origin
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"time"
+)
+
+// Origin is a read-only view of an upstream blob store.
+type Origin interface {
+	// Head returns object metadata. If the blob does not exist, returns
+	// ErrNotFound. If the blob is an unsupported type (e.g., azureblob
+	// non-BlockBlob), returns UnsupportedBlobTypeError.
+	Head(ctx context.Context, bucket, key string) (ObjectInfo, error)
+
+	// GetRange fetches [off, off+n) bytes of the object. The etag is
+	// passed as `If-Match: <etag>` so a mid-flight overwrite is detected
+	// at the wire (returns OriginETagChangedError).
+	GetRange(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error)
+
+	// List enumerates objects under prefix. Pagination via marker.
+	List(ctx context.Context, bucket, prefix, marker string, max int) (ListResult, error)
+}
+
+// ObjectInfo is the result of a successful Head.
+type ObjectInfo struct {
+	Size          int64
+	ETag          string
+	ContentType   string
+	LastValidated time.Time
+	LastStatus    int
+}
+
+// ListResult is the paginated result of List.
+type ListResult struct {
+	Entries     []ObjectEntry
+	NextMarker  string
+	IsTruncated bool
+}
+
+// ObjectEntry is one item in a ListResult.
+type ObjectEntry struct {
+	Key      string
+	Size     int64
+	ETag     string
+	BlobType string // "" for s3; "BlockBlob" / "PageBlob" / "AppendBlob" for azureblob
+}
+
+// Sentinel errors. Wrap with %w so callers use errors.Is.
+//
+// Driver contract:
+//
+//   - ErrNotFound: blob does not exist. AWS S3 driver returns this for
+//     NoSuchKey responses; the azureblob driver for BlobNotFound /
+//     ContainerNotFound.
+//   - ErrAuth: 401 / 403. AWS S3 driver returns this for AccessDenied
+//     and similar; the azureblob driver for HTTP 401/403 and the
+//     AuthenticationFailed / AuthorizationFailure codes.
+//
+// New drivers should map their SDK-specific not-found and auth
+// indicators onto these sentinels so handlers can route consistently
+// via errors.Is.
+var (
+	ErrNotFound = errors.New("origin: not found")
+	ErrAuth     = errors.New("origin: auth")
+)
+
+// OriginETagChangedError is returned by GetRange when the origin
+// rejects the If-Match precondition.
+type OriginETagChangedError struct {
+	Bucket string
+	Key    string
+	Want   string
+}
+
+func (e *OriginETagChangedError) Error() string {
+	return fmt.Sprintf("origin etag changed for %s/%s: want=%q",
+		e.Bucket, e.Key, e.Want)
+}
+
+// UnsupportedBlobTypeError is returned by azureblob.Head when the
+// target is a Page or Append blob. Orca only serves Block Blobs.
+type UnsupportedBlobTypeError struct {
+	Bucket   string
+	Key      string
+	BlobType string
+}
+
+func (e *UnsupportedBlobTypeError) Error() string {
+	return fmt.Sprintf("origin unsupported blob type %s for %s/%s",
+		e.BlobType, e.Bucket, e.Key)
+}
+
+// MissingETagError is returned by the fetch coordinator when an
+// origin Head response carries an empty ETag. chunk.Path encodes the
+// ETag in its hash input; a stable cache key requires the origin to
+// supply one. Misconfigured backends (some S3-compatible
+// implementations with specific bucket policies, custom origins not
+// following the AWS/Azure contract) can omit ETags, in which case
+// two different versions of the same (bucket, key) would alias to
+// the same chunk.Path and orca would silently serve stale bytes.
+// Rejecting at Head time surfaces the misconfiguration immediately
+// instead of after observable corruption.
+type MissingETagError struct {
+	Bucket string
+	Key    string
+}
+
+func (e *MissingETagError) Error() string {
+	return fmt.Sprintf("origin returned empty ETag for %s/%s; orca requires versioned origins",
+		e.Bucket, e.Key)
+}
+
+// ETagShort returns the first 8 characters of an unquoted ETag for
+// log/debug emissions. ETags are not secrets but they're long enough
+// to make log lines hard to read; the prefix is sufficient for
+// matching one fill against another. Returns the input unchanged
+// when shorter than 8 chars.
+func ETagShort(etag string) string {
+	const n = 8
+	if len(etag) <= n {
+		return etag
+	}
+
+	return etag[:n]
+}
diff --git a/internal/orca/origin/origin_test.go b/internal/orca/origin/origin_test.go
new file mode 100644
index 00000000..f9f1f85d
--- /dev/null
+++ b/internal/orca/origin/origin_test.go
@@ -0,0 +1,32 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package origin
+
+import "testing"
+
+// TestETagShort covers the truncation contract: ETags 8 characters or
+// shorter pass through unchanged; longer ETags are truncated to the
+// first 8 characters. The truncation is for log-line readability only;
+// callers must not use the short form as a precondition value.
+func TestETagShort(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		in   string
+		want string
+	}{
+		{"", ""},
+		{"abc", "abc"},
+		{"01234567", "01234567"},
+		{"012345678", "01234567"},
+		{"0x8DDCAFE00000000ABCDEF", "0x8DDCAF"},
+	}
+
+	for _, tt := range tests {
+		got := ETagShort(tt.in)
+		if got != tt.want {
+			t.Errorf("ETagShort(%q) = %q, want %q", tt.in, got, tt.want)
+		}
+	}
+}
diff --git a/internal/orca/server/server.go b/internal/orca/server/server.go
new file mode 100644
index 00000000..1cf51d8c
--- /dev/null
+++ b/internal/orca/server/server.go
@@ -0,0 +1,975 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Package server holds the HTTP handlers for the client edge and the
+// internal-listener.
+//
+// Client edge (8443): GET /{bucket}/{key} (with optional Range), HEAD,
+// LIST. No auth in dev (server.auth.enabled=false).
+//
+// Internal listener (8444): GET /internal/fill?<chunk-key>. No mTLS in
+// dev (cluster.internal_tls.enabled=false).
+package server
+
+import (
+	"bufio"
+	"context"
+	"encoding/xml"
+	"errors"
+	"fmt"
+	"io"
+	"log/slog"
+	"net/http"
+	"strconv"
+	"strings"
+
+	"github.com/Azure/unbounded/internal/orca/chunk"
+	"github.com/Azure/unbounded/internal/orca/cluster"
+	"github.com/Azure/unbounded/internal/orca/config"
+	"github.com/Azure/unbounded/internal/orca/origin"
+)
+
+// EdgeHandler implements the client-edge S3 surface.
+type EdgeHandler struct {
+	fc  edgeFetchAPI
+	cfg *config.Config
+	log *slog.Logger
+}
+
+// edgeFetchAPI is the surface area EdgeHandler depends on. The real
+// *fetch.Coordinator satisfies it; tests substitute small fakes for
+// deterministic unit-level coverage.
+type edgeFetchAPI interface {
+	HeadObject(ctx context.Context, bucket, key string) (origin.ObjectInfo, error)
+	GetChunk(ctx context.Context, k chunk.Key, objectSize int64) (io.ReadCloser, error)
+	Origin() origin.Origin
+}
+
+// NewEdgeHandler wires the edge handler.
+func NewEdgeHandler(fc edgeFetchAPI, cfg *config.Config, log *slog.Logger) *EdgeHandler {
+	return &EdgeHandler{fc: fc, cfg: cfg, log: log}
+}
+
+// ServeHTTP routes incoming client requests.
+//
+// Routing (path-style only, since LocalStack and most dev clients
+// use path-style):
+//
+//	GET  /                                  -> ListBuckets (not supported; 405)
+//	GET  /{bucket}/?list-type=2&prefix=...  -> ListObjectsV2
+//	GET  /{bucket}/                         -> ListObjectsV2 (default)
+//	GET  /{bucket}/{key}                    -> GetObject (with optional Range)
+//	HEAD /{bucket}/{key}                    -> HeadObject
+//	HEAD /{bucket}/                         -> HeadBucket (not supported; 405)
+func (h *EdgeHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
+	if h.cfg.Server.Auth.Enabled {
+		// Stub: production would dispatch to bearer/mTLS validation.
+		// In dev (auth.enabled=false) we skip entirely.
+		http.Error(w, "auth required (server.auth.enabled=true) but not implemented in MVP",
+			http.StatusUnauthorized)
+
+		return
+	}
+
+	bucket, key := splitPath(r.URL.Path)
+
+	h.log.LogAttrs(r.Context(), slog.LevelDebug, "edge_request",
+		slog.String("method", r.Method),
+		slog.String("path", r.URL.Path),
+		slog.String("bucket", bucket),
+		slog.String("key", key),
+		slog.String("range", r.Header.Get("Range")),
+		slog.String("remote", r.RemoteAddr),
+	)
+
+	switch r.Method {
+	case http.MethodHead:
+		if key == "" {
+			h.notImplemented(w, "HeadBucket")
+			return
+		}
+
+		h.handleHead(w, r, bucket, key)
+	case http.MethodGet:
+		if key == "" {
+			h.handleList(w, r, bucket)
+			return
+		}
+
+		h.handleGet(w, r, bucket, key)
+	default:
+		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
+	}
+}
+
+func (h *EdgeHandler) handleHead(w http.ResponseWriter, r *http.Request, bucket, key string) {
+	info, err := h.fc.HeadObject(r.Context(), bucket, key)
+	if err != nil {
+		h.writeOriginError(w, err)
+		return
+	}
+
+	h.log.LogAttrs(r.Context(), slog.LevelDebug, "edge_head_response",
+		slog.String("bucket", bucket),
+		slog.String("key", key),
+		slog.Int64("size", info.Size),
+		slog.String("etag", origin.ETagShort(info.ETag)),
+	)
+
+	setObjectHeaders(w, info)
+	// HEAD must report the Content-Length the GET response would carry.
+	w.Header().Set("Content-Length", strconv.FormatInt(info.Size, 10))
+	w.WriteHeader(http.StatusOK)
+}
+
+func (h *EdgeHandler) handleGet(w http.ResponseWriter, r *http.Request, bucket, key string) {
+	info, err := h.fc.HeadObject(r.Context(), bucket, key)
+	if err != nil {
+		h.writeOriginError(w, err)
+		return
+	}
+
+	// Zero-byte objects short-circuit to 200 + empty body. The normal
+	// flow below would compute rangeEnd = info.Size - 1 = -1 and fall
+	// into the rangeStart > rangeEnd guard, returning a spurious 416
+	// for what should be a successful empty-body fetch. Any Range
+	// request against a zero-byte object is genuinely unsatisfiable
+	// and remains a 416 (RFC 7233).
+	if info.Size == 0 {
+		if r.Header.Get("Range") != "" {
+			http.Error(w, "range not satisfiable", http.StatusRequestedRangeNotSatisfiable)
+			return
+		}
+
+		setObjectHeaders(w, info)
+		w.Header().Set("Content-Length", "0")
+		w.WriteHeader(http.StatusOK)
+
+		h.log.LogAttrs(r.Context(), slog.LevelDebug, "edge_get_empty_object",
+			slog.String("bucket", bucket),
+			slog.String("key", key),
+		)
+
+		return
+	}
+
+	// Determine byte range.
+	var (
+		rangeStart int64
+		rangeEnd   = info.Size - 1
+		hasRange   bool
+		statusCode = http.StatusOK
+	)
+	if rh := r.Header.Get("Range"); rh != "" {
+		s, e, ok := parseSimpleByteRange(rh, info.Size)
+		if !ok {
+			http.Error(w, "invalid Range", http.StatusRequestedRangeNotSatisfiable)
+			return
+		}
+
+		rangeStart, rangeEnd = s, e
+		hasRange = true
+		statusCode = http.StatusPartialContent
+	}
+
+	if rangeStart > rangeEnd {
+		http.Error(w, "range not satisfiable", http.StatusRequestedRangeNotSatisfiable)
+		return
+	}
+
+	chunkSize := chunk.SizeFor(info.Size, h.cfg.Chunking.Size, h.cfg.Chunking.AsChunkTiers())
+	firstChunk, lastChunk := chunk.IndexRange(rangeStart, rangeEnd, chunkSize, info.Size)
+
+	h.log.LogAttrs(r.Context(), slog.LevelDebug, "edge_get_plan",
+		slog.String("bucket", bucket),
+		slog.String("key", key),
+		slog.Int64("range_start", rangeStart),
+		slog.Int64("range_end", rangeEnd),
+		slog.Int64("first_chunk", firstChunk),
+		slog.Int64("last_chunk", lastChunk),
+		slog.Int64("chunk_size", chunkSize),
+		slog.Bool("has_range", hasRange),
+	)
+
+	// Fetch the first chunk before committing any response headers
+	// so that origin errors (404, auth, timeout, mid-stream blob
+	// fault) surface as a clean S3-style error response instead of
+	// a half-written 200 followed by a dropped connection. Once the
+	// first byte is in hand we know the rest of the stream is
+	// "tentatively" healthy; subsequent chunk failures remain
+	// mid-stream aborts.
+	firstKey := chunk.Key{
+		OriginID:  h.cfg.Origin.ID,
+		Bucket:    bucket,
+		ObjectKey: key,
+		ETag:      info.ETag,
+		ChunkSize: chunkSize,
+		Index:     firstChunk,
+	}
+
+	firstBody, err := h.fc.GetChunk(r.Context(), firstKey, info.Size)
+	if err != nil {
+		h.writeOriginError(w, err)
+		return
+	}
+	// Peek a single byte to drain any first-read errors from the
+	// underlying body (e.g. cachestore-backed bodies can fail on the
+	// first network read). io.EOF on peek is acceptable for the
+	// degenerate empty-chunk case.
+	firstReader := bufio.NewReader(firstBody)
+	if _, err := firstReader.Peek(1); err != nil && !errors.Is(err, io.EOF) {
+		firstBody.Close() //nolint:errcheck // closing on error path
+		h.writeOriginError(w, err)
+
+		return
+	}
+
+	// Set headers eagerly. The response headers are committed below
+	// once the first chunk has been confirmed readable; thereafter
+	// any failure becomes a mid-stream abort.
+	setObjectHeaders(w, info)
+	w.Header().Set("Content-Length", strconv.FormatInt(rangeEnd-rangeStart+1, 10))
+
+	if hasRange {
+		w.Header().Set("Content-Range",
+			fmt.Sprintf("bytes %d-%d/%d", rangeStart, rangeEnd, info.Size))
+	}
+	// Write status now; subsequent failures become mid-stream aborts.
+	w.WriteHeader(statusCode)
+
+	// Stream the first chunk's slice. Any failure here is now a
+	// mid-stream abort (headers are committed).
+	off, length := chunk.ChunkSlice(firstChunk, chunkSize, rangeStart, rangeEnd, info.Size)
+	if err := streamSlice(w, firstReader, off, length); err != nil {
+		firstBody.Close() //nolint:errcheck // body close best-effort, response already streaming
+		h.log.LogAttrs(r.Context(), slog.LevelWarn, "mid-stream copy failed",
+			slog.String("bucket", bucket),
+			slog.String("key", key),
+			slog.Int64("chunk", firstChunk),
+			slog.Any("err", err),
+		)
+
+		return
+	}
+
+	firstBody.Close() //nolint:errcheck // body close best-effort, response already streaming
+
+	if f, ok := w.(http.Flusher); ok {
+		f.Flush()
+	}
+
+	if firstChunk < lastChunk {
+		h.streamRemainingChunks(r.Context(), w, bucket, key, info, chunkSize,
+			rangeStart, rangeEnd, firstChunk+1, lastChunk)
+	}
+
+	h.log.LogAttrs(r.Context(), slog.LevelDebug, "edge_get_complete",
+		slog.String("bucket", bucket),
+		slog.String("key", key),
+		slog.Int64("bytes", rangeEnd-rangeStart+1),
+	)
+}
+
+// streamRemainingChunks fetches and streams chunks [firstIdx, lastIdx]
+// after the first chunk has already been delivered. Honors the
+// configured Chunking.Readahead depth: with depth > 0 a producer
+// goroutine prefetches up to depth chunks while the consumer streams
+// the current one; with depth == 0 the loop is strictly sequential
+// (zero-overhead opt-out preserving the pre-readahead behavior).
+//
+// All failures here are mid-stream aborts: response headers are
+// already committed, so the only remedy is logging and returning.
+func (h *EdgeHandler) streamRemainingChunks(
+	ctx context.Context,
+	w http.ResponseWriter,
+	bucket, key string,
+	info origin.ObjectInfo,
+	chunkSize, rangeStart, rangeEnd int64,
+	firstIdx, lastIdx int64,
+) {
+	depth := h.cfg.Chunking.ReadaheadDepth()
+	if depth <= 0 {
+		h.streamRemainingChunksSequential(ctx, w, bucket, key, info, chunkSize,
+			rangeStart, rangeEnd, firstIdx, lastIdx)
+
+		return
+	}
+
+	h.streamRemainingChunksReadahead(ctx, w, bucket, key, info, chunkSize,
+		rangeStart, rangeEnd, firstIdx, lastIdx, depth)
+}
+
+// streamRemainingChunksSequential is the pre-readahead loop body:
+// fetch chunk N, stream it, close it, advance. One in-flight chunk
+// fetch at a time. Used when Chunking.Readahead is 0.
+func (h *EdgeHandler) streamRemainingChunksSequential(
+	ctx context.Context,
+	w http.ResponseWriter,
+	bucket, key string,
+	info origin.ObjectInfo,
+	chunkSize, rangeStart, rangeEnd int64,
+	firstIdx, lastIdx int64,
+) {
+	for ci := firstIdx; ci <= lastIdx; ci++ {
+		ckey := chunk.Key{
+			OriginID:  h.cfg.Origin.ID,
+			Bucket:    bucket,
+			ObjectKey: key,
+			ETag:      info.ETag,
+			ChunkSize: chunkSize,
+			Index:     ci,
+		}
+
+		h.log.LogAttrs(ctx, slog.LevelDebug, "edge_get_chunk_next",
+			slog.String("bucket", bucket),
+			slog.String("key", key),
+			slog.Int64("chunk", ci),
+		)
+
+		body, err := h.fc.GetChunk(ctx, ckey, info.Size)
+		if err != nil {
+			h.log.LogAttrs(ctx, slog.LevelWarn, "mid-stream chunk fetch failed",
+				slog.String("bucket", bucket),
+				slog.String("key", key),
+				slog.Int64("chunk", ci),
+				slog.Any("err", err),
+			)
+
+			return
+		}
+
+		off, length := chunk.ChunkSlice(ci, chunkSize, rangeStart, rangeEnd, info.Size)
+		if err := streamSlice(w, body, off, length); err != nil {
+			body.Close() //nolint:errcheck // chunk body close best-effort, response already streaming
+			h.log.LogAttrs(ctx, slog.LevelWarn, "mid-stream copy failed",
+				slog.String("bucket", bucket),
+				slog.String("key", key),
+				slog.Int64("chunk", ci),
+				slog.Any("err", err),
+			)
+
+			return
+		}
+
+		body.Close() //nolint:errcheck // chunk body close best-effort, response already streaming
+
+		if f, ok := w.(http.Flusher); ok {
+			f.Flush()
+		}
+	}
+}
+
+// pendingChunk is one item produced by the readahead pipeline: an
+// in-order chunk body (or the error that prevented fetching it).
+// The consumer is responsible for Close()ing rc when non-nil.
+type pendingChunk struct {
+	idx int64
+	rc  io.ReadCloser
+	err error
+}
+
+// readaheadJob is a chunk-fetch slot held in the dispatcher's queue.
+// Each job owns a 1-buffered result channel that its worker writes
+// to exactly once before exiting.
+type readaheadJob struct {
+	idx int64
+	rc  chan pendingChunk
+}
+
+// streamRemainingChunksReadahead runs a producer goroutine that
+// fetches chunks ahead into a bounded channel of capacity depth,
+// while the main goroutine streams the current chunk to the client.
+// This hides per-chunk cachestore RTT behind body transfer time so
+// large-blob GETs no longer pay N strictly-serial round trips.
+//
+// Lifecycle:
+//   - Consumer aborts (mid-stream copy failure, fetch error,
+//     producer-channel closed early) cancel the producer's context;
+//     the producer drains and closes any bodies it has already
+//     prefetched on the way out.
+//   - Producer panics are recovered, logged, and surface to the
+//     consumer as an early channel close; the consumer treats that
+//     as a mid-stream abort and returns cleanly.
+//   - Context cancellation from the caller (client disconnect)
+//     propagates through prefetchCtx, cancelling in-flight
+//     GetChunk calls and causing the producer to exit.
+func (h *EdgeHandler) streamRemainingChunksReadahead(
+	ctx context.Context,
+	w http.ResponseWriter,
+	bucket, key string,
+	info origin.ObjectInfo,
+	chunkSize, rangeStart, rangeEnd int64,
+	firstIdx, lastIdx int64,
+	depth int,
+) {
+	prefetchCtx, cancelPrefetch := context.WithCancel(ctx)
+	defer cancelPrefetch()
+
+	ch := h.prefetchChunks(prefetchCtx, bucket, key, info.ETag, chunkSize, info.Size,
+		firstIdx, lastIdx, depth)
+
+	// Drain helper: close any pending bodies left in the channel
+	// after we decide to abort. The producer's own deferred
+	// per-pending close (on ctx cancel during send-select) covers
+	// the in-flight body it is currently fetching; this loop covers
+	// the buffered ones the consumer never reaches.
+	drain := func() {
+		for p := range ch {
+			if p.rc != nil {
+				_ = p.rc.Close() //nolint:errcheck // drain best-effort
+			}
+		}
+	}
+
+	expectedIdx := firstIdx
+
+	for p := range ch {
+		if p.err != nil {
+			if p.rc != nil {
+				_ = p.rc.Close() //nolint:errcheck // close error-path body
+			}
+
+			h.log.LogAttrs(ctx, slog.LevelWarn, "mid-stream chunk fetch failed",
+				slog.String("bucket", bucket),
+				slog.String("key", key),
+				slog.Int64("chunk", p.idx),
+				slog.Any("err", p.err),
+			)
+			cancelPrefetch()
+			drain()
+
+			return
+		}
+
+		if p.idx != expectedIdx {
+			// Defensive: producer is required to deliver chunks in
+			// index order. A mismatch indicates a programming error
+			// upstream; treat as mid-stream abort.
+			if p.rc != nil {
+				_ = p.rc.Close() //nolint:errcheck
+			}
+
+			h.log.LogAttrs(ctx, slog.LevelError, "readahead order violation",
+				slog.String("bucket", bucket),
+				slog.String("key", key),
+				slog.Int64("expected", expectedIdx),
+				slog.Int64("got", p.idx),
+			)
+			cancelPrefetch()
+			drain()
+
+			return
+		}
+
+		h.log.LogAttrs(ctx, slog.LevelDebug, "edge_get_chunk_next",
+			slog.String("bucket", bucket),
+			slog.String("key", key),
+			slog.Int64("chunk", p.idx),
+		)
+
+		off, length := chunk.ChunkSlice(p.idx, chunkSize, rangeStart, rangeEnd, info.Size)
+		if err := streamSlice(w, p.rc, off, length); err != nil {
+			_ = p.rc.Close() //nolint:errcheck
+			h.log.LogAttrs(ctx, slog.LevelWarn, "mid-stream copy failed",
+				slog.String("bucket", bucket),
+				slog.String("key", key),
+				slog.Int64("chunk", p.idx),
+				slog.Any("err", err),
+			)
+			cancelPrefetch()
+			drain()
+
+			return
+		}
+
+		_ = p.rc.Close() //nolint:errcheck
+
+		if f, ok := w.(http.Flusher); ok {
+			f.Flush()
+		}
+
+		expectedIdx++
+	}
+
+	if expectedIdx <= lastIdx {
+		// Channel closed before all chunks were delivered. The
+		// producer either panicked (already logged) or its context
+		// was cancelled (client disconnect or earlier mid-stream
+		// abort - the latter would have returned above). Surface as
+		// a mid-stream warning so operators see truncated responses.
+		h.log.LogAttrs(ctx, slog.LevelWarn, "readahead truncated response",
+			slog.String("bucket", bucket),
+			slog.String("key", key),
+			slog.Int64("expected_through", lastIdx),
+			slog.Int64("delivered_through", expectedIdx-1),
+		)
+	}
+}
+
+// prefetchChunks fetches chunks [firstIdx, lastIdx] into a bounded
+// channel of capacity depth, with up to depth fetches in flight in
+// parallel. Bodies are delivered in chunk-index order so the
+// consumer can stream them straight to the client without
+// reassembly. Caller drains the channel and owns Close() for any
+// non-nil rc it receives.
+//
+// Fan-out model:
+//   - A dispatcher goroutine spawns one worker goroutine per chunk
+//     index, gated by a depth-sized job queue so peak in-flight
+//     workers stays at depth (+ at most one in-flight push and one
+//     in-flight delivery).
+//   - Each worker calls h.fc.GetChunk for its chunk and writes the
+//     result to a per-job, 1-buffered result channel.
+//   - The dispatcher pushes job descriptors onto the queue in
+//     chunk-index order so the delivery loop reads results in that
+//     same order.
+//
+// Lifecycle:
+//   - All workers ALWAYS write exactly once to their result channel
+//     before exiting. This invariant lets the delivery loop block
+//     on `<-j.rc` without risk of deadlock even on ctx-cancel.
+//   - On ctx cancellation the dispatcher drains its currently-spawned
+//     worker (waiting for the unconditional rc write) and exits.
+//     The delivery loop drains any remaining queued jobs the same
+//     way, closing the body in each result.
+//   - Producer panics are recovered, logged, and surface to the
+//     consumer as an early channel close; the consumer treats that
+//     as a mid-stream abort.
+func (h *EdgeHandler) prefetchChunks(
+	ctx context.Context,
+	bucket, key, etag string,
+	chunkSize, objectSize int64,
+	firstIdx, lastIdx int64,
+	depth int,
+) <-chan pendingChunk {
+	out := make(chan pendingChunk, depth)
+
+	queue := make(chan readaheadJob, depth)
+
+	// Dispatcher: spawn workers in chunk-index order, gated by the
+	// queue's capacity. Each worker is independent and runs to
+	// completion (always writes its result), so the dispatcher
+	// doesn't need to track them after spawning.
+	go func() {
+		defer close(queue)
+		defer func() {
+			if r := recover(); r != nil {
+				h.log.LogAttrs(ctx, slog.LevelError, "readahead dispatcher panic",
+					slog.String("bucket", bucket),
+					slog.String("key", key),
+					slog.Any("panic", r),
+				)
+			}
+		}()
+
+		for ci := firstIdx; ci <= lastIdx; ci++ {
+			if err := ctx.Err(); err != nil {
+				return
+			}
+
+			rc := make(chan pendingChunk, 1)
+
+			// Spawn worker first so the result channel always
+			// receives a write, even if ctx is cancelled while we
+			// block on the queue push below. The worker's
+			// GetChunk call will short-circuit on a cancelled ctx
+			// with err != nil and rc == nil, satisfying the
+			// "always write" invariant.
+			go func(idx int64, rc chan<- pendingChunk) {
+				defer func() {
+					if r := recover(); r != nil {
+						h.log.LogAttrs(ctx, slog.LevelError, "readahead worker panic",
+							slog.String("bucket", bucket),
+							slog.String("key", key),
+							slog.Int64("chunk", idx),
+							slog.Any("panic", r),
+						)
+						// Preserve the write-once invariant: send a
+						// synthetic error so the delivery loop sees
+						// the panic-affected chunk as a fetch error
+						// rather than blocking forever on rc.
+						rc <- pendingChunk{idx: idx, err: fmt.Errorf("readahead worker panic: %v", r)}
+					}
+				}()
+
+				ckey := chunk.Key{
+					OriginID:  h.cfg.Origin.ID,
+					Bucket:    bucket,
+					ObjectKey: key,
+					ETag:      etag,
+					ChunkSize: chunkSize,
+					Index:     idx,
+				}
+
+				body, err := h.fc.GetChunk(ctx, ckey, objectSize)
+				rc <- pendingChunk{idx: idx, rc: body, err: err}
+			}(ci, rc)
+
+			select {
+			case queue <- readaheadJob{idx: ci, rc: rc}:
+			case <-ctx.Done():
+				// Worker is in flight; drain it so the body (if any)
+				// is closed and the goroutine doesn't leak.
+				p := <-rc
+				if p.rc != nil {
+					_ = p.rc.Close() //nolint:errcheck // ctx-cancel body close best-effort
+				}
+
+				return
+			}
+		}
+	}()
+
+	// Delivery: read worker results in chunk-index order and forward
+	// to `out`. Drains in-flight jobs on ctx-cancel.
+	go func() {
+		defer close(out)
+		defer func() {
+			if r := recover(); r != nil {
+				h.log.LogAttrs(ctx, slog.LevelError, "readahead delivery panic",
+					slog.String("bucket", bucket),
+					slog.String("key", key),
+					slog.Any("panic", r),
+				)
+			}
+		}()
+
+		for j := range queue {
+			p := <-j.rc // worker always writes; safe blocking read
+
+			if err := ctx.Err(); err != nil {
+				if p.rc != nil {
+					_ = p.rc.Close() //nolint:errcheck // drain best-effort
+				}
+
+				drainQueue(queue)
+
+				return
+			}
+
+			select {
+			case out <- p:
+			case <-ctx.Done():
+				if p.rc != nil {
+					_ = p.rc.Close() //nolint:errcheck // drain best-effort
+				}
+
+				drainQueue(queue)
+
+				return
+			}
+		}
+	}()
+
+	return out
+}
+
+// drainQueue is a helper that empties any remaining job descriptors
+// from the readahead queue, waits for each spawned worker to deliver
+// its result, and closes any body the result carries. Used on
+// ctx-cancel cleanup paths so worker goroutines and cachestore
+// response bodies do not leak when the consumer aborts mid-stream.
+func drainQueue(queue <-chan readaheadJob) {
+	for j := range queue {
+		p := <-j.rc
+		if p.rc != nil {
+			_ = p.rc.Close() //nolint:errcheck // cleanup best-effort
+		}
+	}
+}
+
+// streamSlice copies length bytes starting at off from src to dst.
+func streamSlice(dst io.Writer, src io.Reader, off, length int64) error {
+	if off > 0 {
+		if _, err := io.CopyN(io.Discard, src, off); err != nil {
+			return err
+		}
+	}
+
+	if length > 0 {
+		if _, err := io.CopyN(dst, src, length); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// handleList is a thin pass-through to Origin.List for v1 prototype.
+func (h *EdgeHandler) handleList(w http.ResponseWriter, r *http.Request, bucket string) {
+	// Pass-through; very minimal S3 ListObjectsV2 shape. Reviewers can
+	// curl this for sanity but full S3 list semantics are not in MVP.
+	prefix := r.URL.Query().Get("prefix")
+	marker := r.URL.Query().Get("continuation-token")
+	maxStr := r.URL.Query().Get("max-keys")
+	maxKeys := 1000
+
+	if maxStr != "" {
+		if v, err := strconv.Atoi(maxStr); err == nil && v > 0 {
+			maxKeys = v
+		}
+	}
+
+	type listEntry struct {
+		Key  string `xml:"Key"`
+		Size int64  `xml:"Size"`
+		ETag string `xml:"ETag"`
+	}
+
+	type listResult struct {
+		XMLName     xml.Name    `xml:"ListBucketResult"`
+		Name        string      `xml:"Name"`
+		Prefix      string      `xml:"Prefix"`
+		KeyCount    int         `xml:"KeyCount"`
+		MaxKeys     int         `xml:"MaxKeys"`
+		IsTruncated bool        `xml:"IsTruncated"`
+		NextMarker  string      `xml:"NextContinuationToken,omitempty"`
+		Contents    []listEntry `xml:"Contents"`
+	}
+
+	or := h.fc.Origin()
+
+	res, err := or.List(r.Context(), bucket, prefix, marker, maxKeys)
+	if err != nil {
+		h.writeOriginError(w, err)
+		return
+	}
+
+	body := listResult{
+		Name:        bucket,
+		Prefix:      prefix,
+		KeyCount:    len(res.Entries),
+		MaxKeys:     maxKeys,
+		IsTruncated: res.IsTruncated,
+		NextMarker:  res.NextMarker,
+	}
+	for _, e := range res.Entries {
+		body.Contents = append(body.Contents, listEntry{Key: e.Key, Size: e.Size, ETag: e.ETag})
+	}
+
+	w.Header().Set("Content-Type", "application/xml")
+	w.WriteHeader(http.StatusOK)
+	enc := xml.NewEncoder(w)
+
+	if err := enc.Encode(body); err != nil {
+		// Headers already sent; we cannot change the status. Log so
+		// truncated / malformed LIST responses are visible, matching
+		// the mid-stream warn-level treatment in the GET path.
+		h.log.LogAttrs(r.Context(), slog.LevelWarn, "list xml encode failed",
+			slog.String("bucket", bucket),
+			slog.String("prefix", prefix),
+			slog.Any("err", err),
+		)
+	}
+}
+
+func (h *EdgeHandler) notImplemented(w http.ResponseWriter, op string) {
+	http.Error(w, op+" not implemented in MVP", http.StatusNotImplemented)
+}
+
+func (h *EdgeHandler) writeOriginError(w http.ResponseWriter, err error) {
+	switch {
+	case errors.Is(err, origin.ErrNotFound):
+		http.Error(w, "NoSuchKey", http.StatusNotFound)
+	case errors.Is(err, origin.ErrAuth):
+		http.Error(w, "Unauthorized origin", http.StatusBadGateway)
+	default:
+		var (
+			ube *origin.UnsupportedBlobTypeError
+			ec  *origin.OriginETagChangedError
+			mte *origin.MissingETagError
+		)
+
+		switch {
+		case errors.As(err, &ube):
+			http.Error(w, "OriginUnsupported: "+ube.Error(), http.StatusBadGateway)
+		case errors.As(err, &ec):
+			http.Error(w, "OriginETagChanged", http.StatusBadGateway)
+		case errors.As(err, &mte):
+			http.Error(w, "OriginMissingETag: "+mte.Error(), http.StatusBadGateway)
+		default:
+			h.log.LogAttrs(context.Background(), slog.LevelWarn, "origin error",
+				slog.Any("err", err),
+			)
+			http.Error(w, "OriginUnreachable", http.StatusBadGateway)
+		}
+	}
+}
+
+func setObjectHeaders(w http.ResponseWriter, info origin.ObjectInfo) {
+	if info.ContentType != "" {
+		w.Header().Set("Content-Type", info.ContentType)
+	}
+
+	if info.ETag != "" {
+		w.Header().Set("ETag", "\""+info.ETag+"\"")
+	}
+
+	w.Header().Set("Accept-Ranges", "bytes")
+}
+
+func splitPath(p string) (bucket, key string) {
+	p = strings.TrimPrefix(p, "/")
+	if p == "" {
+		return "", ""
+	}
+
+	idx := strings.IndexByte(p, '/')
+	if idx < 0 {
+		return p, ""
+	}
+
+	return p[:idx], p[idx+1:]
+}
+
+func parseSimpleByteRange(h string, size int64) (start, end int64, ok bool) {
+	if !strings.HasPrefix(h, "bytes=") {
+		return 0, 0, false
+	}
+
+	spec := strings.TrimPrefix(h, "bytes=")
+
+	parts := strings.Split(spec, "-")
+	if len(parts) != 2 {
+		return 0, 0, false
+	}
+
+	if parts[0] == "" {
+		// Suffix: -N (last N bytes)
+		n, err := strconv.ParseInt(parts[1], 10, 64)
+		if err != nil || n <= 0 || n > size {
+			return 0, 0, false
+		}
+
+		return size - n, size - 1, true
+	}
+
+	s, err := strconv.ParseInt(parts[0], 10, 64)
+	if err != nil || s < 0 {
+		return 0, 0, false
+	}
+
+	if parts[1] == "" {
+		return s, size - 1, true
+	}
+
+	e, err := strconv.ParseInt(parts[1], 10, 64)
+	if err != nil || e < s {
+		return 0, 0, false
+	}
+
+	if e >= size {
+		e = size - 1
+	}
+
+	return s, e, true
+}
+
+// InternalHandler implements GET /internal/fill on the internal
+// listener. Plain HTTP/2 (no mTLS) in dev.
+type InternalHandler struct {
+	fc  internalFetchAPI
+	cl  *cluster.Cluster
+	log *slog.Logger
+}
+
+// internalFetchAPI is the surface area InternalHandler depends on. The
+// real *fetch.Coordinator satisfies it; tests substitute small fakes.
+type internalFetchAPI interface {
+	FillForPeer(ctx context.Context, k chunk.Key, objectSize int64) (io.ReadCloser, error)
+}
+
+// NewInternalHandler wires the internal handler.
+func NewInternalHandler(fc internalFetchAPI, cl *cluster.Cluster, log *slog.Logger) *InternalHandler {
+	return &InternalHandler{fc: fc, cl: cl, log: log}
+}
+
+// ServeHTTP handles GET /internal/fill?<chunk-key-params>.
+func (h *InternalHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
+	if r.URL.Path != "/internal/fill" {
+		http.NotFound(w, r)
+		return
+	}
+
+	if r.Method != http.MethodGet {
+		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
+		return
+	}
+
+	if r.Header.Get("X-Orca-Internal") != "1" {
+		http.Error(w, "missing X-Orca-Internal header", http.StatusBadRequest)
+		return
+	}
+
+	k, objectSize, err := cluster.DecodeChunkKey(r.URL.Query())
+	if err != nil {
+		http.Error(w, "invalid chunk key: "+err.Error(), http.StatusBadRequest)
+		return
+	}
+
+	h.log.LogAttrs(r.Context(), slog.LevelDebug, "internal_fill_request",
+		intChunkAttrs(k),
+		slog.Int64("object_size", objectSize),
+		slog.String("remote", r.RemoteAddr),
+	)
+
+	if !h.cl.IsCoordinator(k) {
+		h.log.LogAttrs(r.Context(), slog.LevelDebug, "internal_fill_not_coordinator",
+			intChunkAttrs(k),
+			slog.String("remote", r.RemoteAddr),
+		)
+		http.Error(w, `{"reason":"not_coordinator"}`, http.StatusConflict)
+
+		return
+	}
+
+	body, err := h.fc.FillForPeer(r.Context(), k, objectSize)
+	if err != nil {
+		h.log.LogAttrs(r.Context(), slog.LevelWarn, "internal fill failed",
+			intChunkAttrs(k),
+			slog.Any("err", err),
+		)
+		http.Error(w, "fill failed", http.StatusBadGateway)
+
+		return
+	}
+	defer body.Close() //nolint:errcheck // internal-fill body close best-effort
+
+	// Set Content-Length so the requesting peer can validate the
+	// streamed body length and detect mid-stream truncation. If the
+	// expected length is zero (unknown objectSize or empty chunk) we
+	// omit Content-Length; the requester then falls back to
+	// connection-close framing without length validation.
+	expectedLen := k.ExpectedLen(objectSize)
+	if expectedLen > 0 {
+		w.Header().Set("Content-Length", strconv.FormatInt(expectedLen, 10))
+	}
+
+	w.Header().Set("Content-Type", "application/octet-stream")
+	w.WriteHeader(http.StatusOK)
+
+	if _, copyErr := io.Copy(w, body); copyErr != nil {
+		h.log.LogAttrs(r.Context(), slog.LevelWarn, "internal fill copy failed",
+			intChunkAttrs(k),
+			slog.Any("err", copyErr),
+		)
+
+		return
+	}
+
+	h.log.LogAttrs(r.Context(), slog.LevelDebug, "internal_fill_complete",
+		intChunkAttrs(k),
+		slog.Int64("bytes", expectedLen),
+	)
+}
+
+// intChunkAttrs renders the chunk's identifying tuple as a slog
+// group attribute matching the cross-package 'chunk' taxonomy.
+func intChunkAttrs(k chunk.Key) slog.Attr {
+	return slog.Group("chunk",
+		slog.String("origin_id", k.OriginID),
+		slog.String("bucket", k.Bucket),
+		slog.String("key", k.ObjectKey),
+		slog.Int64("index", k.Index),
+	)
+}
diff --git a/internal/orca/server/server_test.go b/internal/orca/server/server_test.go
new file mode 100644
index 00000000..b95ccb51
--- /dev/null
+++ b/internal/orca/server/server_test.go
@@ -0,0 +1,1389 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package server
+
+import (
+	"bytes"
+	"context"
+	"encoding/xml"
+	"errors"
+	"io"
+	"log/slog"
+	"net/http"
+	"net/http/httptest"
+	"strconv"
+	"strings"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/Azure/unbounded/internal/orca/chunk"
+	"github.com/Azure/unbounded/internal/orca/cluster"
+	"github.com/Azure/unbounded/internal/orca/config"
+	"github.com/Azure/unbounded/internal/orca/origin"
+)
+
+// fakeEdgeAPI satisfies edgeFetchAPI with canned responses for unit
+// tests. Only the field for the call you want to mock needs to be
+// set; an unset *Func panics if the test invokes the corresponding
+// method.
+type fakeEdgeAPI struct {
+	HeadObjectFunc func(ctx context.Context, bucket, key string) (origin.ObjectInfo, error)
+	GetChunkFunc   func(ctx context.Context, k chunk.Key, objectSize int64) (io.ReadCloser, error)
+	OriginVal      origin.Origin
+}
+
+func (f *fakeEdgeAPI) HeadObject(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) {
+	return f.HeadObjectFunc(ctx, bucket, key)
+}
+
+func (f *fakeEdgeAPI) GetChunk(ctx context.Context, k chunk.Key, objectSize int64) (io.ReadCloser, error) {
+	return f.GetChunkFunc(ctx, k, objectSize)
+}
+
+func (f *fakeEdgeAPI) Origin() origin.Origin { return f.OriginVal }
+
+// fakeOrigin satisfies origin.Origin for handler tests. Only the
+// fields used in the test need to be populated.
+type fakeOrigin struct {
+	HeadFunc     func(ctx context.Context, bucket, key string) (origin.ObjectInfo, error)
+	GetRangeFunc func(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error)
+	ListFunc     func(ctx context.Context, bucket, prefix, marker string, max int) (origin.ListResult, error)
+}
+
+func (f *fakeOrigin) Head(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) {
+	return f.HeadFunc(ctx, bucket, key)
+}
+
+func (f *fakeOrigin) GetRange(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error) {
+	return f.GetRangeFunc(ctx, bucket, key, etag, off, n)
+}
+
+func (f *fakeOrigin) List(ctx context.Context, bucket, prefix, marker string, max int) (origin.ListResult, error) {
+	return f.ListFunc(ctx, bucket, prefix, marker, max)
+}
+
+// TestWriteOriginError covers all five branches of the error mapping.
+// Previously only ErrNotFound was exercised (via integration test).
+func TestWriteOriginError(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name       string
+		err        error
+		wantStatus int
+		wantBody   string
+	}{
+		{
+			name:       "not found",
+			err:        origin.ErrNotFound,
+			wantStatus: http.StatusNotFound,
+			wantBody:   "NoSuchKey",
+		},
+		{
+			name:       "auth",
+			err:        origin.ErrAuth,
+			wantStatus: http.StatusBadGateway,
+			wantBody:   "Unauthorized origin",
+		},
+		{
+			name: "unsupported blob type",
+			err: &origin.UnsupportedBlobTypeError{
+				Bucket:   "ctr",
+				Key:      "page-blob",
+				BlobType: "PageBlob",
+			},
+			wantStatus: http.StatusBadGateway,
+			wantBody:   "OriginUnsupported",
+		},
+		{
+			name: "etag changed",
+			err: &origin.OriginETagChangedError{
+				Bucket: "b", Key: "k", Want: "old",
+			},
+			wantStatus: http.StatusBadGateway,
+			wantBody:   "OriginETagChanged",
+		},
+		{
+			name:       "generic error",
+			err:        errors.New("unexpected"),
+			wantStatus: http.StatusBadGateway,
+			wantBody:   "OriginUnreachable",
+		},
+	}
+
+	h := &EdgeHandler{log: discardLogger()}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			rr := httptest.NewRecorder()
+			h.writeOriginError(rr, tt.err)
+
+			if rr.Code != tt.wantStatus {
+				t.Errorf("status=%d want %d", rr.Code, tt.wantStatus)
+			}
+
+			if !strings.Contains(rr.Body.String(), tt.wantBody) {
+				t.Errorf("body %q does not contain %q", rr.Body.String(), tt.wantBody)
+			}
+		})
+	}
+}
+
+// TestHandleHead covers metadata propagation and the not-found error
+// path on HEAD requests.
+func TestHandleHead(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name       string
+		info       origin.ObjectInfo
+		err        error
+		wantStatus int
+		wantHdrs   map[string]string
+	}{
+		{
+			name: "normal blob",
+			info: origin.ObjectInfo{
+				Size:        1024,
+				ETag:        "abc123",
+				ContentType: "application/octet-stream",
+			},
+			wantStatus: http.StatusOK,
+			wantHdrs: map[string]string{
+				"Content-Length": "1024",
+				"ETag":           `"abc123"`,
+				"Content-Type":   "application/octet-stream",
+			},
+		},
+		{
+			name:       "missing content type omits header",
+			info:       origin.ObjectInfo{Size: 99, ETag: "x"},
+			wantStatus: http.StatusOK,
+			wantHdrs: map[string]string{
+				"Content-Length": "99",
+				"ETag":           `"x"`,
+			},
+		},
+		{
+			name:       "missing etag omits header",
+			info:       origin.ObjectInfo{Size: 7},
+			wantStatus: http.StatusOK,
+			wantHdrs: map[string]string{
+				"Content-Length": "7",
+			},
+		},
+		{
+			name:       "origin not found yields 404",
+			err:        origin.ErrNotFound,
+			wantStatus: http.StatusNotFound,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			fc := &fakeEdgeAPI{
+				HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) {
+					return tt.info, tt.err
+				},
+			}
+			h := NewEdgeHandler(fc, &config.Config{}, discardLogger())
+
+			req := httptest.NewRequest(http.MethodHead, "/bucket/key", nil)
+			rr := httptest.NewRecorder()
+			h.handleHead(rr, req, "bucket", "key")
+
+			if rr.Code != tt.wantStatus {
+				t.Errorf("status=%d want %d", rr.Code, tt.wantStatus)
+			}
+
+			for k, want := range tt.wantHdrs {
+				got := rr.Header().Get(k)
+				if got != want {
+					t.Errorf("header %s=%q want %q", k, got, want)
+				}
+			}
+
+			if rr.Body.Len() != 0 && tt.wantStatus == http.StatusOK {
+				t.Errorf("HEAD body should be empty; got %d bytes", rr.Body.Len())
+			}
+		})
+	}
+}
+
+// TestHandleList covers the XML pass-through, prefix propagation,
+// truncation, and empty-list handling.
+func TestHandleList(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name        string
+		prefix      string
+		listResult  origin.ListResult
+		listErr     error
+		wantStatus  int
+		wantKeys    []string
+		wantTrunc   bool
+		wantNextTok string
+	}{
+		{
+			name:   "normal list",
+			prefix: "alpha/",
+			listResult: origin.ListResult{
+				Entries: []origin.ObjectEntry{
+					{Key: "alpha/one", Size: 3, ETag: "e1"},
+					{Key: "alpha/two", Size: 5, ETag: "e2"},
+				},
+			},
+			wantStatus: http.StatusOK,
+			wantKeys:   []string{"alpha/one", "alpha/two"},
+		},
+		{
+			name:       "empty list",
+			prefix:     "missing/",
+			listResult: origin.ListResult{},
+			wantStatus: http.StatusOK,
+			wantKeys:   nil,
+		},
+		{
+			name: "truncated list",
+			listResult: origin.ListResult{
+				Entries:     []origin.ObjectEntry{{Key: "k1"}},
+				IsTruncated: true,
+				NextMarker:  "next-page",
+			},
+			wantStatus:  http.StatusOK,
+			wantKeys:    []string{"k1"},
+			wantTrunc:   true,
+			wantNextTok: "next-page",
+		},
+		{
+			name:       "origin error yields 502",
+			listErr:    errors.New("upstream broken"),
+			wantStatus: http.StatusBadGateway,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			or := &fakeOrigin{
+				ListFunc: func(_ context.Context, bucket, prefix, _ string, _ int) (origin.ListResult, error) {
+					if bucket != "b" {
+						t.Errorf("bucket=%q want %q", bucket, "b")
+					}
+
+					if prefix != tt.prefix {
+						t.Errorf("prefix=%q want %q", prefix, tt.prefix)
+					}
+
+					return tt.listResult, tt.listErr
+				},
+			}
+			fc := &fakeEdgeAPI{OriginVal: or}
+			h := NewEdgeHandler(fc, &config.Config{}, discardLogger())
+
+			req := httptest.NewRequest(http.MethodGet,
+				"/b/?list-type=2&prefix="+tt.prefix, nil)
+			rr := httptest.NewRecorder()
+			h.handleList(rr, req, "b")
+
+			if rr.Code != tt.wantStatus {
+				t.Errorf("status=%d want %d body=%s", rr.Code, tt.wantStatus, rr.Body.String())
+			}
+
+			if tt.wantStatus != http.StatusOK {
+				return
+			}
+
+			var got struct {
+				XMLName     xml.Name `xml:"ListBucketResult"`
+				Name        string   `xml:"Name"`
+				Prefix      string   `xml:"Prefix"`
+				KeyCount    int      `xml:"KeyCount"`
+				IsTruncated bool     `xml:"IsTruncated"`
+				NextMarker  string   `xml:"NextContinuationToken"`
+				Contents    []struct {
+					Key string `xml:"Key"`
+				} `xml:"Contents"`
+			}
+			if err := xml.Unmarshal(rr.Body.Bytes(), &got); err != nil {
+				t.Fatalf("xml decode: %v body=%s", err, rr.Body.String())
+			}
+
+			if got.Name != "b" {
+				t.Errorf("Name=%q want %q", got.Name, "b")
+			}
+
+			if got.Prefix != tt.prefix {
+				t.Errorf("Prefix=%q want %q", got.Prefix, tt.prefix)
+			}
+
+			if got.KeyCount != len(tt.wantKeys) {
+				t.Errorf("KeyCount=%d want %d", got.KeyCount, len(tt.wantKeys))
+			}
+
+			if got.IsTruncated != tt.wantTrunc {
+				t.Errorf("IsTruncated=%v want %v", got.IsTruncated, tt.wantTrunc)
+			}
+
+			if got.NextMarker != tt.wantNextTok {
+				t.Errorf("NextMarker=%q want %q", got.NextMarker, tt.wantNextTok)
+			}
+
+			gotKeys := make([]string, 0, len(got.Contents))
+			for _, c := range got.Contents {
+				gotKeys = append(gotKeys, c.Key)
+			}
+
+			if !equalStrings(gotKeys, tt.wantKeys) {
+				t.Errorf("keys=%v want %v", gotKeys, tt.wantKeys)
+			}
+		})
+	}
+}
+
+// TestParseSimpleByteRange covers all parser branches.
+func TestParseSimpleByteRange(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name      string
+		header    string
+		size      int64
+		wantStart int64
+		wantEnd   int64
+		wantOK    bool
+	}{
+		{"normal range", "bytes=0-99", 1024, 0, 99, true},
+		{"suffix range", "bytes=-100", 1024, 924, 1023, true},
+		{"open-ended", "bytes=100-", 1024, 100, 1023, true},
+		{"end clamped to size", "bytes=0-9999", 1024, 0, 1023, true},
+		{"start > end rejected", "bytes=100-50", 1024, 0, 0, false},
+		{"missing prefix rejected", "0-99", 1024, 0, 0, false},
+		{"multi-range rejected", "bytes=0-99,200-299", 1024, 0, 0, false},
+		{"empty rejected", "", 1024, 0, 0, false},
+		{"bytes= alone rejected", "bytes=", 1024, 0, 0, false},
+		{"suffix larger than size rejected", "bytes=-9999", 1024, 0, 0, false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			s, e, ok := parseSimpleByteRange(tt.header, tt.size)
+			if ok != tt.wantOK {
+				t.Fatalf("ok=%v want %v (s=%d e=%d)", ok, tt.wantOK, s, e)
+			}
+
+			if !ok {
+				return
+			}
+
+			if s != tt.wantStart || e != tt.wantEnd {
+				t.Errorf("(s,e)=(%d,%d) want (%d,%d)", s, e, tt.wantStart, tt.wantEnd)
+			}
+		})
+	}
+}
+
+// TestSplitPath covers path splitting edge cases.
+func TestSplitPath(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		in         string
+		wantBucket string
+		wantKey    string
+	}{
+		{"", "", ""},
+		{"/", "", ""},
+		{"/bucket", "bucket", ""},
+		{"/bucket/", "bucket", ""},
+		{"/bucket/key", "bucket", "key"},
+		{"/bucket/path/to/key", "bucket", "path/to/key"},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.in, func(t *testing.T) {
+			b, k := splitPath(tt.in)
+			if b != tt.wantBucket || k != tt.wantKey {
+				t.Errorf("splitPath(%q)=(%q,%q) want (%q,%q)",
+					tt.in, b, k, tt.wantBucket, tt.wantKey)
+			}
+		})
+	}
+}
+
+// TestSetObjectHeaders covers header propagation including the
+// always-set Accept-Ranges and the conditionally-set fields.
+func TestSetObjectHeaders(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name string
+		info origin.ObjectInfo
+		want map[string]string
+	}{
+		{
+			name: "all fields set",
+			info: origin.ObjectInfo{ETag: "abc", ContentType: "text/plain"},
+			want: map[string]string{
+				"ETag":          `"abc"`,
+				"Content-Type":  "text/plain",
+				"Accept-Ranges": "bytes",
+			},
+		},
+		{
+			name: "missing content type",
+			info: origin.ObjectInfo{ETag: "abc"},
+			want: map[string]string{
+				"ETag":          `"abc"`,
+				"Content-Type":  "",
+				"Accept-Ranges": "bytes",
+			},
+		},
+		{
+			name: "missing etag",
+			info: origin.ObjectInfo{ContentType: "text/plain"},
+			want: map[string]string{
+				"ETag":          "",
+				"Content-Type":  "text/plain",
+				"Accept-Ranges": "bytes",
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			rr := httptest.NewRecorder()
+			setObjectHeaders(rr, tt.info)
+
+			for k, want := range tt.want {
+				if got := rr.Header().Get(k); got != want {
+					t.Errorf("header %s=%q want %q", k, got, want)
+				}
+			}
+		})
+	}
+}
+
+// errReader is an io.ReadCloser whose first Read returns errFirst.
+// Used to simulate cachestore-backed bodies that fail on their first
+// network read (e.g. azureblob returning a 503 mid-stream after the
+// header transaction succeeded).
+type errReader struct {
+	errFirst error
+	closed   bool
+}
+
+func (r *errReader) Read(_ []byte) (int, error) { return 0, r.errFirst }
+func (r *errReader) Close() error               { r.closed = true; return nil }
+
+// TestHandleGet_EmptyObject_NoRange_Returns200 verifies that a GET
+// against a zero-byte object responds with 200 + Content-Length: 0
+// and an empty body. Previously the handler computed rangeEnd = -1
+// and fell into the unsatisfiable-range branch, returning a spurious
+// 416 for what should be a successful empty-body fetch.
+func TestHandleGet_EmptyObject_NoRange_Returns200(t *testing.T) {
+	t.Parallel()
+
+	info := origin.ObjectInfo{Size: 0, ETag: "etag-empty", ContentType: "application/octet-stream"}
+
+	fc := &fakeEdgeAPI{
+		HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) {
+			return info, nil
+		},
+		// GetChunkFunc deliberately unset; the short-circuit must
+		// not call into the fetch coordinator for zero-byte objects.
+	}
+
+	cfg := &config.Config{Chunking: config.Chunking{Size: 1024}}
+	h := NewEdgeHandler(fc, cfg, discardLogger())
+
+	req := httptest.NewRequest(http.MethodGet, "/bucket/empty", nil)
+	rr := httptest.NewRecorder()
+	h.handleGet(rr, req, "bucket", "empty")
+
+	if rr.Code != http.StatusOK {
+		t.Errorf("status=%d want %d", rr.Code, http.StatusOK)
+	}
+
+	if rr.Body.Len() != 0 {
+		t.Errorf("body=%d bytes, want 0", rr.Body.Len())
+	}
+
+	if got := rr.Header().Get("Content-Length"); got != "0" {
+		t.Errorf("Content-Length=%q want %q", got, "0")
+	}
+}
+
+// TestHandleGet_EmptyObject_WithRange_Returns416 verifies that a
+// Range request against a zero-byte object remains a 416. RFC 7233
+// classifies any range over a zero-byte representation as
+// unsatisfiable.
+func TestHandleGet_EmptyObject_WithRange_Returns416(t *testing.T) {
+	t.Parallel()
+
+	info := origin.ObjectInfo{Size: 0, ETag: "etag-empty"}
+
+	fc := &fakeEdgeAPI{
+		HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) {
+			return info, nil
+		},
+	}
+
+	cfg := &config.Config{Chunking: config.Chunking{Size: 1024}}
+	h := NewEdgeHandler(fc, cfg, discardLogger())
+
+	req := httptest.NewRequest(http.MethodGet, "/bucket/empty", nil)
+	req.Header.Set("Range", "bytes=0-0")
+
+	rr := httptest.NewRecorder()
+	h.handleGet(rr, req, "bucket", "empty")
+
+	if rr.Code != http.StatusRequestedRangeNotSatisfiable {
+		t.Errorf("status=%d want %d", rr.Code, http.StatusRequestedRangeNotSatisfiable)
+	}
+}
+
+// TestHandleGet_FirstChunkErrorReturnsCleanError verifies that when
+// the very first chunk fetch fails the edge handler responds with an
+// S3-style error response (proper status + error body) rather than
+// committing a 200 status and then aborting the connection
+// mid-stream.
+//
+// Regression test for B4.
+func TestHandleGet_FirstChunkErrorReturnsCleanError(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name       string
+		fetchErr   error
+		peekErr    error // non-nil means GetChunk succeeds but first Read fails
+		wantStatus int
+		wantBody   string // substring assertion on the error body
+	}{
+		{
+			name:       "GetChunk returns NotFound",
+			fetchErr:   origin.ErrNotFound,
+			wantStatus: http.StatusNotFound,
+			wantBody:   "NoSuchKey",
+		},
+		{
+			name:       "GetChunk returns generic origin error",
+			fetchErr:   errors.New("origin: connect: timeout"),
+			wantStatus: http.StatusBadGateway,
+			wantBody:   "OriginUnreachable",
+		},
+		{
+			name:       "GetChunk succeeds but first Read fails",
+			peekErr:    errors.New("cachestore: blob fetch 503"),
+			wantStatus: http.StatusBadGateway,
+			wantBody:   "OriginUnreachable",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			info := origin.ObjectInfo{
+				Size:        1024,
+				ETag:        "etag1",
+				ContentType: "application/octet-stream",
+			}
+
+			fc := &fakeEdgeAPI{
+				HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) {
+					return info, nil
+				},
+				GetChunkFunc: func(_ context.Context, _ chunk.Key, _ int64) (io.ReadCloser, error) {
+					if tt.fetchErr != nil {
+						return nil, tt.fetchErr
+					}
+
+					return &errReader{errFirst: tt.peekErr}, nil
+				},
+			}
+
+			cfg := &config.Config{Chunking: config.Chunking{Size: 1024}}
+			h := NewEdgeHandler(fc, cfg, discardLogger())
+
+			req := httptest.NewRequest(http.MethodGet, "/bucket/key", nil)
+			rr := httptest.NewRecorder()
+			h.handleGet(rr, req, "bucket", "key")
+
+			if rr.Code != tt.wantStatus {
+				t.Errorf("status=%d want %d; body=%q", rr.Code, tt.wantStatus, rr.Body.String())
+			}
+
+			if !strings.Contains(rr.Body.String(), tt.wantBody) {
+				t.Errorf("body=%q want substring %q", rr.Body.String(), tt.wantBody)
+			}
+			// A bug here would 200 first, then write nothing or
+			// partial bytes; verify the response did not commit a
+			// success status that contradicts the error.
+			if rr.Code == http.StatusOK {
+				t.Errorf("handler committed 200 before failure became known")
+			}
+		})
+	}
+}
+
+type fakeInternalFetchAPI struct {
+	body []byte
+}
+
+func (f *fakeInternalFetchAPI) FillForPeer(_ context.Context, _ chunk.Key, _ int64) (io.ReadCloser, error) {
+	return io.NopCloser(strings.NewReader(string(f.body))), nil
+}
+
+// singleSelfPeerSource produces a peer-set containing only self.
+// IsCoordinator therefore returns true for every key, letting the
+// internal-fill handler proceed past its coordinator check without
+// requiring the test to know the rendezvous-hash outcome.
+type singleSelfPeerSource struct{}
+
+func (singleSelfPeerSource) Peers(_ context.Context) ([]cluster.Peer, error) {
+	return []cluster.Peer{{IP: "10.0.0.1", Self: true}}, nil
+}
+
+// TestInternalHandler_SetsContentLength verifies the internal-fill
+// handler sets Content-Length to chunk.Key.ExpectedLen(objectSize)
+// on the response. Setting the header allows the requesting peer to
+// detect mid-stream truncation via net/http's standard io.ErrUnexpectedEOF
+// surfacing; without it, a truncated peer response would be
+// indistinguishable from a clean EOF.
+//
+// Regression test for B7.
+func TestInternalHandler_SetsContentLength(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name       string
+		chunkSize  int64
+		index      int64
+		objectSize int64
+		wantLen    string
+	}{
+		{
+			name:       "full chunk",
+			chunkSize:  1024,
+			index:      0,
+			objectSize: 4096,
+			wantLen:    "1024",
+		},
+		{
+			// The fake body returns chunkSize=1024 bytes but the
+			// tail-chunk ExpectedLen is 428 (3500 - 3*1024). The
+			// resulting Content-Length: 428 can only come from the
+			// handler computing ExpectedLen explicitly, proving the
+			// header is not auto-derived from the body length.
+			name:       "tail chunk partial",
+			chunkSize:  1024,
+			index:      3,
+			objectSize: 3500,
+			wantLen:    "428",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			c, err := cluster.New(t.Context(),
+				config.Cluster{
+					Service:           "test",
+					SelfPodIP:         "10.0.0.1",
+					MembershipRefresh: time.Hour,
+					InternalListen:    "0.0.0.0:8444",
+				},
+				cluster.WithPeerSource(singleSelfPeerSource{}),
+			)
+			if err != nil {
+				t.Fatalf("cluster.New: %v", err)
+			}
+
+			t.Cleanup(func() { _ = c.Close(context.Background()) })
+
+			h := NewInternalHandler(&fakeInternalFetchAPI{body: make([]byte, tt.chunkSize)}, c, discardLogger())
+
+			req := httptest.NewRequest(http.MethodGet, "/internal/fill?"+(func() string {
+				k := chunk.Key{
+					OriginID:  "origin",
+					Bucket:    "bucket",
+					ObjectKey: "key",
+					ETag:      "etag",
+					ChunkSize: tt.chunkSize,
+					Index:     tt.index,
+				}
+
+				return encodeQuery(k, tt.objectSize)
+			})(), nil)
+			req.Header.Set("X-Orca-Internal", "1")
+
+			rr := httptest.NewRecorder()
+			h.ServeHTTP(rr, req)
+
+			if rr.Code != http.StatusOK {
+				t.Fatalf("status = %d want 200; body=%q", rr.Code, rr.Body.String())
+			}
+
+			got := rr.Header().Get("Content-Length")
+			if got != tt.wantLen {
+				t.Errorf("Content-Length = %q want %q", got, tt.wantLen)
+			}
+		})
+	}
+}
+
+// encodeQuery duplicates cluster.encodeChunkKey for test purposes
+// (it is unexported in the cluster package).
+func encodeQuery(k chunk.Key, objectSize int64) string {
+	return "origin_id=" + k.OriginID +
+		"&bucket=" + k.Bucket +
+		"&key=" + k.ObjectKey +
+		"&etag=" + k.ETag +
+		"&chunk_size=" + strconv.FormatInt(k.ChunkSize, 10) +
+		"&index=" + strconv.FormatInt(k.Index, 10) +
+		"&object_size=" + strconv.FormatInt(objectSize, 10)
+}
+
+// helpers
+
+// TestEdgeHandler_DebugEmissions verifies that the edge handler
+// emits a debug-level 'edge_request' trace at entry and at least
+// one of the response-shape emissions for HEAD/GET. Operators rely
+// on these to trace a single request across the structured-log
+// output.
+func TestEdgeHandler_DebugEmissions(t *testing.T) {
+	t.Parallel()
+
+	info := origin.ObjectInfo{Size: 5, ETag: "etag-xyz", ContentType: "application/octet-stream"}
+
+	fc := &fakeEdgeAPI{
+		HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) {
+			return info, nil
+		},
+	}
+
+	var buf bytes.Buffer
+
+	cfg := &config.Config{Chunking: config.Chunking{Size: 1024}}
+	h := NewEdgeHandler(fc, cfg, debugLoggerTo(&buf))
+
+	req := httptest.NewRequest(http.MethodHead, "/bkt/obj", nil)
+	rr := httptest.NewRecorder()
+	h.ServeHTTP(rr, req)
+
+	out := buf.String()
+	for _, want := range []string{"edge_request", "edge_head_response", "bucket=bkt", "key=obj"} {
+		if !strings.Contains(out, want) {
+			t.Errorf("expected %q in debug output; got %q", want, out)
+		}
+	}
+}
+
+func discardLogger() *slog.Logger {
+	return slog.New(slog.NewTextHandler(io.Discard, nil))
+}
+
+// debugLoggerTo returns a slog.Logger that writes Debug-and-above
+// emissions to buf. Used by tests asserting debug-trace emission
+// at known call sites.
+func debugLoggerTo(buf *bytes.Buffer) *slog.Logger {
+	return slog.New(slog.NewTextHandler(buf, &slog.HandlerOptions{Level: slog.LevelDebug}))
+}
+
+func equalStrings(a, b []string) bool {
+	if len(a) != len(b) {
+		return false
+	}
+
+	for i := range a {
+		if a[i] != b[i] {
+			return false
+		}
+	}
+
+	return true
+}
+
+// readaheadConfig returns a config tailored for readahead unit tests.
+// Origin.ID is required by the chunk-key construction inside
+// handleGet; chunk size and readahead are explicit so each test
+// controls them independently.
+func readaheadConfig(chunkSize int64, readahead int) *config.Config {
+	r := readahead
+
+	return &config.Config{
+		Origin: config.Origin{ID: "origin"},
+		Chunking: config.Chunking{
+			Size:      chunkSize,
+			Readahead: &r,
+		},
+	}
+}
+
+// makeChunkData returns a chunkSize-byte payload whose contents
+// encode the chunk index so test assertions can verify that the
+// streamed body delivers chunks in correct order. Each byte at
+// offset b within chunk i is `byte((int(i) + b) % 251)`; using a
+// prime modulus avoids spurious alignment on power-of-two
+// boundaries.
+func makeChunkData(idx int64, n int) []byte {
+	out := make([]byte, n)
+	for b := 0; b < n; b++ {
+		out[b] = byte((int(idx) + b) % 251)
+	}
+
+	return out
+}
+
+// trackedReadCloser is an io.ReadCloser that records Close() calls
+// for the readahead-cancellation test. closedCh fires once on the
+// first Close().
+type trackedReadCloser struct {
+	io.Reader
+	closed   bool
+	closedCh chan struct{}
+}
+
+func (t *trackedReadCloser) Close() error {
+	if !t.closed {
+		t.closed = true
+		close(t.closedCh)
+	}
+
+	return nil
+}
+
+// TestHandleGet_DynamicChunkSize_SmallObject verifies a small object
+// (well below any tier threshold) uses the base Chunking.Size. The
+// fake fetch records the chunk-key sizes seen so we can assert the
+// edge handler is not regressing to the previous global-only chunk
+// size on the small-object path.
+func TestHandleGet_DynamicChunkSize_SmallObject(t *testing.T) {
+	t.Parallel()
+
+	info := origin.ObjectInfo{Size: 100 * (1 << 20), ETag: "etag", ContentType: "application/octet-stream"}
+
+	var (
+		mu        sync.Mutex
+		seenSizes []int64
+	)
+
+	fc := &fakeEdgeAPI{
+		HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) {
+			return info, nil
+		},
+		GetChunkFunc: func(_ context.Context, k chunk.Key, _ int64) (io.ReadCloser, error) {
+			mu.Lock()
+
+			seenSizes = append(seenSizes, k.ChunkSize)
+			mu.Unlock()
+
+			return io.NopCloser(bytes.NewReader(makeChunkData(k.Index, int(k.ExpectedLen(info.Size))))), nil
+		},
+	}
+
+	cfg := &config.Config{
+		Origin: config.Origin{ID: "origin"},
+		Chunking: config.Chunking{
+			Size: 8 << 20,
+			Tiers: []config.ChunkTier{
+				{MinObjectSize: 1 << 30, ChunkSize: 64 << 20},
+			},
+		},
+	}
+
+	h := NewEdgeHandler(fc, cfg, discardLogger())
+
+	req := httptest.NewRequest(http.MethodGet, "/bucket/key", nil)
+	rr := httptest.NewRecorder()
+	h.handleGet(rr, req, "bucket", "key")
+
+	if rr.Code != http.StatusOK {
+		t.Fatalf("status=%d want 200; body=%q", rr.Code, rr.Body.String())
+	}
+
+	mu.Lock()
+	defer mu.Unlock()
+
+	if len(seenSizes) == 0 {
+		t.Fatalf("no chunk fetches recorded")
+	}
+
+	for i, sz := range seenSizes {
+		if sz != 8<<20 {
+			t.Errorf("seenSizes[%d]=%d want 8 MiB (base)", i, sz)
+		}
+	}
+}
+
+// TestHandleGet_DynamicChunkSize_LargeObject verifies a large object
+// (above the tier threshold) uses the tier's ChunkSize and that the
+// number of chunks fetched matches the larger granularity (fewer
+// requests).
+func TestHandleGet_DynamicChunkSize_LargeObject(t *testing.T) {
+	t.Parallel()
+
+	// 700 GiB synthetic object; chunked at the 128 MiB tier this is
+	// 5600 chunks. We don't fetch them all in this test (we set up a
+	// fake that streams a tiny payload per chunk request), but we do
+	// confirm the chunk keys carry ChunkSize=128 MiB and the
+	// first-chunk path lands on Index=0.
+	const (
+		large  = int64(700) * (1 << 30) // 700 GiB
+		tierSz = int64(128) << 20       // 128 MiB
+		baseSz = int64(8) << 20         // 8 MiB
+	)
+
+	info := origin.ObjectInfo{Size: large, ETag: "etag", ContentType: "application/octet-stream"}
+
+	// To keep the test fast we use a Range request covering exactly
+	// the first chunk; otherwise the handler would attempt to stream
+	// 700 GiB. Range bytes=0-(tierSz-1) targets chunk 0 only.
+	var (
+		mu        sync.Mutex
+		seenSizes []int64
+		seenIdx   []int64
+	)
+
+	fc := &fakeEdgeAPI{
+		HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) {
+			return info, nil
+		},
+		GetChunkFunc: func(_ context.Context, k chunk.Key, _ int64) (io.ReadCloser, error) {
+			mu.Lock()
+
+			seenSizes = append(seenSizes, k.ChunkSize)
+			seenIdx = append(seenIdx, k.Index)
+			mu.Unlock()
+
+			return io.NopCloser(bytes.NewReader(makeChunkData(k.Index, int(k.ExpectedLen(info.Size))))), nil
+		},
+	}
+
+	cfg := &config.Config{
+		Origin: config.Origin{ID: "origin"},
+		Chunking: config.Chunking{
+			Size: baseSz,
+			Tiers: []config.ChunkTier{
+				{MinObjectSize: 10 * (1 << 30), ChunkSize: tierSz},
+			},
+		},
+	}
+
+	h := NewEdgeHandler(fc, cfg, discardLogger())
+
+	req := httptest.NewRequest(http.MethodGet, "/bucket/key", nil)
+	req.Header.Set("Range", "bytes=0-"+strconv.FormatInt(tierSz-1, 10))
+
+	rr := httptest.NewRecorder()
+	h.handleGet(rr, req, "bucket", "key")
+
+	if rr.Code != http.StatusPartialContent {
+		t.Fatalf("status=%d want 206; body=%q", rr.Code, rr.Body.String())
+	}
+
+	mu.Lock()
+	defer mu.Unlock()
+
+	if len(seenSizes) != 1 {
+		t.Fatalf("expected exactly 1 chunk fetch for first-chunk range; got %d", len(seenSizes))
+	}
+
+	if seenSizes[0] != tierSz {
+		t.Errorf("seenSizes[0]=%d want %d (tier size)", seenSizes[0], tierSz)
+	}
+
+	if seenIdx[0] != 0 {
+		t.Errorf("seenIdx[0]=%d want 0", seenIdx[0])
+	}
+}
+
+// TestHandleGet_Readahead_DisabledZero verifies that Readahead=0
+// preserves the strictly-sequential behavior: GetChunk is called
+// one chunk at a time, in order, with no concurrent fetches in
+// flight. The fake fetch deliberately reports concurrent calls so a
+// regression that started the prefetcher despite depth=0 would be
+// caught.
+func TestHandleGet_Readahead_DisabledZero(t *testing.T) {
+	t.Parallel()
+
+	const (
+		chunkSize  = int64(1024)
+		nChunks    = int64(5)
+		objectSize = chunkSize * nChunks
+	)
+
+	info := origin.ObjectInfo{Size: objectSize, ETag: "e", ContentType: "application/octet-stream"}
+
+	var (
+		mu        sync.Mutex
+		inFlight  int
+		maxInFlt  int
+		callOrder []int64
+	)
+
+	fc := &fakeEdgeAPI{
+		HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) {
+			return info, nil
+		},
+		GetChunkFunc: func(_ context.Context, k chunk.Key, _ int64) (io.ReadCloser, error) {
+			mu.Lock()
+			inFlight++
+
+			if inFlight > maxInFlt {
+				maxInFlt = inFlight
+			}
+
+			callOrder = append(callOrder, k.Index)
+			mu.Unlock()
+			// Brief sleep to widen any concurrency window.
+			time.Sleep(5 * time.Millisecond)
+
+			mu.Lock()
+			inFlight--
+			mu.Unlock()
+
+			return io.NopCloser(bytes.NewReader(makeChunkData(k.Index, int(chunkSize)))), nil
+		},
+	}
+
+	cfg := readaheadConfig(chunkSize, 0)
+	h := NewEdgeHandler(fc, cfg, discardLogger())
+
+	req := httptest.NewRequest(http.MethodGet, "/bucket/key", nil)
+	rr := httptest.NewRecorder()
+	h.handleGet(rr, req, "bucket", "key")
+
+	if rr.Code != http.StatusOK {
+		t.Fatalf("status=%d want 200; body=%q", rr.Code, rr.Body.String())
+	}
+
+	if int64(rr.Body.Len()) != objectSize {
+		t.Errorf("body=%d bytes, want %d", rr.Body.Len(), objectSize)
+	}
+
+	mu.Lock()
+	defer mu.Unlock()
+
+	if maxInFlt != 1 {
+		t.Errorf("max in-flight=%d want 1 (no readahead)", maxInFlt)
+	}
+
+	for i, idx := range callOrder {
+		if idx != int64(i) {
+			t.Errorf("callOrder[%d]=%d want %d (in-order serial fetch)", i, idx, i)
+		}
+	}
+}
+
+// TestHandleGet_Readahead_ParallelHidesLatency verifies that with
+// Readahead > 0 the handler can have multiple chunk fetches in
+// flight concurrently. The fake fetch sleeps long enough per chunk
+// that the wall-clock time for the full GET should be substantially
+// less than nChunks * perChunkDelay if readahead is working.
+func TestHandleGet_Readahead_ParallelHidesLatency(t *testing.T) {
+	t.Parallel()
+
+	const (
+		chunkSize   = int64(1024)
+		nChunks     = int64(5)
+		objectSize  = chunkSize * nChunks
+		perChunkLat = 40 * time.Millisecond
+		readahead   = 4
+	)
+
+	info := origin.ObjectInfo{Size: objectSize, ETag: "e", ContentType: "application/octet-stream"}
+
+	var (
+		mu       sync.Mutex
+		inFlight int
+		maxInFlt int
+	)
+
+	fc := &fakeEdgeAPI{
+		HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) {
+			return info, nil
+		},
+		GetChunkFunc: func(ctx context.Context, k chunk.Key, _ int64) (io.ReadCloser, error) {
+			mu.Lock()
+			inFlight++
+
+			if inFlight > maxInFlt {
+				maxInFlt = inFlight
+			}
+			mu.Unlock()
+
+			select {
+			case <-time.After(perChunkLat):
+			case <-ctx.Done():
+				mu.Lock()
+				inFlight--
+				mu.Unlock()
+
+				return nil, ctx.Err()
+			}
+
+			mu.Lock()
+			inFlight--
+			mu.Unlock()
+
+			return io.NopCloser(bytes.NewReader(makeChunkData(k.Index, int(chunkSize)))), nil
+		},
+	}
+
+	cfg := readaheadConfig(chunkSize, readahead)
+	h := NewEdgeHandler(fc, cfg, discardLogger())
+
+	req := httptest.NewRequest(http.MethodGet, "/bucket/key", nil)
+	rr := httptest.NewRecorder()
+
+	start := time.Now()
+
+	h.handleGet(rr, req, "bucket", "key")
+
+	elapsed := time.Since(start)
+
+	if rr.Code != http.StatusOK {
+		t.Fatalf("status=%d want 200; body=%q", rr.Code, rr.Body.String())
+	}
+
+	if int64(rr.Body.Len()) != objectSize {
+		t.Errorf("body=%d bytes, want %d", rr.Body.Len(), objectSize)
+	}
+
+	// Strict serial baseline = nChunks * perChunkLat. With readahead
+	// we expect substantially less; we conservatively assert <
+	// (nChunks * perChunkLat * 0.8) which gives the test plenty of
+	// CI slack. The exact speedup depends on scheduler timing; the
+	// in-flight max metric below is the deterministic assertion.
+	serialBaseline := time.Duration(nChunks) * perChunkLat
+
+	if elapsed >= serialBaseline {
+		t.Errorf("readahead did not hide latency: elapsed=%v, serial baseline=%v",
+			elapsed, serialBaseline)
+	}
+
+	mu.Lock()
+	defer mu.Unlock()
+
+	if maxInFlt < 2 {
+		t.Errorf("max in-flight=%d want >= 2 (readahead concurrent)", maxInFlt)
+	}
+}
+
+// TestHandleGet_Readahead_CancellationClosesBodies verifies that
+// when the streaming consumer aborts mid-response (e.g. a downstream
+// write fails), every prefetched body still buffered in the
+// readahead channel is Close()d on the way out. Without this the
+// cachestore would leak HTTP response bodies whenever a client
+// disconnects partway through a large blob.
+//
+// Setup: the handler streams to an http.ResponseWriter wrapped to
+// return an io.ErrShortWrite after a fixed byte count, forcing the
+// streamSlice call to abort mid-chunk. We then assert that every
+// trackedReadCloser handed out has had Close() called.
+func TestHandleGet_Readahead_CancellationClosesBodies(t *testing.T) {
+	t.Parallel()
+
+	const (
+		chunkSize  = int64(256)
+		nChunks    = int64(8)
+		objectSize = chunkSize * nChunks
+		readahead  = 4
+	)
+
+	info := origin.ObjectInfo{Size: objectSize, ETag: "e", ContentType: "application/octet-stream"}
+
+	var (
+		mu     sync.Mutex
+		bodies []*trackedReadCloser
+	)
+
+	fc := &fakeEdgeAPI{
+		HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) {
+			return info, nil
+		},
+		GetChunkFunc: func(_ context.Context, k chunk.Key, _ int64) (io.ReadCloser, error) {
+			b := &trackedReadCloser{
+				Reader:   bytes.NewReader(makeChunkData(k.Index, int(chunkSize))),
+				closedCh: make(chan struct{}),
+			}
+
+			mu.Lock()
+
+			bodies = append(bodies, b)
+			mu.Unlock()
+
+			return b, nil
+		},
+	}
+
+	cfg := readaheadConfig(chunkSize, readahead)
+	h := NewEdgeHandler(fc, cfg, discardLogger())
+
+	// shortWriter writes the first maxBytes bytes to inner and
+	// returns io.ErrShortWrite on any further write. Reproduces a
+	// client connection that closes mid-stream.
+	rr := httptest.NewRecorder()
+	w := &shortWriter{inner: rr, maxBytes: int(chunkSize) + int(chunkSize)/2} // 1.5 chunks
+
+	req := httptest.NewRequest(http.MethodGet, "/bucket/key", nil)
+	h.handleGet(w, req, "bucket", "key")
+
+	// All bodies handed out should be closed; allow a brief window
+	// for the producer goroutine to observe ctx-cancellation and
+	// close its in-flight body via the select branch.
+	deadline := time.After(2 * time.Second)
+
+	for i := 0; ; i++ {
+		mu.Lock()
+		allClosed := true
+
+		for _, b := range bodies {
+			if !b.closed {
+				allClosed = false
+				break
+			}
+		}
+
+		count := len(bodies)
+		mu.Unlock()
+
+		if allClosed && count > 1 {
+			// Multiple bodies were handed out and all are closed.
+			return
+		}
+
+		select {
+		case <-deadline:
+			mu.Lock()
+			defer mu.Unlock()
+
+			if count <= 1 {
+				t.Fatalf("only %d bodies handed out; readahead did not engage", count)
+			}
+
+			for j, b := range bodies {
+				if !b.closed {
+					t.Errorf("body[%d] (chunk index %d) not closed", j, j)
+				}
+			}
+
+			return
+		default:
+			time.Sleep(10 * time.Millisecond)
+		}
+
+		_ = i
+	}
+}
+
+// TestHandleGet_Readahead_ProducerPanicRecovered verifies that a
+// panic inside the readahead producer goroutine is recovered, logged,
+// and does not deadlock the consumer or crash the process. The
+// consumer should see an early channel close and treat the response
+// as a mid-stream abort.
+func TestHandleGet_Readahead_ProducerPanicRecovered(t *testing.T) {
+	t.Parallel()
+
+	const (
+		chunkSize  = int64(256)
+		nChunks    = int64(6)
+		objectSize = chunkSize * nChunks
+		readahead  = 2
+	)
+
+	info := origin.ObjectInfo{Size: objectSize, ETag: "e", ContentType: "application/octet-stream"}
+
+	var (
+		mu      sync.Mutex
+		calls   int64
+		panicAt = int64(3) // panic on the 3rd GetChunk
+	)
+
+	fc := &fakeEdgeAPI{
+		HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) {
+			return info, nil
+		},
+		GetChunkFunc: func(_ context.Context, k chunk.Key, _ int64) (io.ReadCloser, error) {
+			mu.Lock()
+			calls++
+			n := calls
+			mu.Unlock()
+
+			if n == panicAt {
+				panic("readahead test: synthetic producer panic")
+			}
+
+			return io.NopCloser(bytes.NewReader(makeChunkData(k.Index, int(chunkSize)))), nil
+		},
+	}
+
+	var logBuf bytes.Buffer
+
+	cfg := readaheadConfig(chunkSize, readahead)
+	h := NewEdgeHandler(fc, cfg, debugLoggerTo(&logBuf))
+
+	req := httptest.NewRequest(http.MethodGet, "/bucket/key", nil)
+	rr := httptest.NewRecorder()
+
+	done := make(chan struct{})
+
+	go func() {
+		defer close(done)
+
+		h.handleGet(rr, req, "bucket", "key")
+	}()
+
+	select {
+	case <-done:
+	case <-time.After(2 * time.Second):
+		t.Fatalf("handler deadlocked after producer panic")
+	}
+
+	// The first chunk was peeked and streamed successfully (a
+	// committed 200 response). Subsequent panic is a mid-stream
+	// abort; the response code is therefore 200 even though the
+	// body is truncated.
+	if rr.Code != http.StatusOK {
+		t.Errorf("status=%d want 200 (panic is mid-stream)", rr.Code)
+	}
+
+	out := logBuf.String()
+	if !strings.Contains(out, "readahead worker panic") {
+		t.Errorf("missing 'readahead worker panic' in log; got %q", out)
+	}
+}
+
+// shortWriter writes the first maxBytes bytes to inner then returns
+// io.ErrShortWrite on any subsequent Write. Used to simulate a
+// client connection that drops mid-response.
+type shortWriter struct {
+	inner    http.ResponseWriter
+	written  int
+	maxBytes int
+}
+
+func (s *shortWriter) Header() http.Header { return s.inner.Header() }
+
+func (s *shortWriter) WriteHeader(code int) { s.inner.WriteHeader(code) }
+
+func (s *shortWriter) Write(p []byte) (int, error) {
+	if s.written >= s.maxBytes {
+		return 0, io.ErrShortWrite
+	}
+
+	remaining := s.maxBytes - s.written
+	if len(p) > remaining {
+		// Write exactly up to the cap, then fail any further calls.
+		n, _ := s.inner.Write(p[:remaining])
+		s.written += n
+
+		return n, io.ErrShortWrite
+	}
+
+	n, err := s.inner.Write(p)
+	s.written += n
+
+	return n, err
+}