diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 76acf952..fa261da4 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -128,6 +128,34 @@ jobs: retention-days: 7 if-no-files-found: ignore + # ---------- Orca Integration Tests ---------- + # Spins up LocalStack and Azurite via testcontainers-go and runs the + # orca in-process integration suite (internal/orca/inttest). Docker + # is preinstalled on GitHub-hosted Ubuntu runners; no extra services: + # block is required. + orca-inttest: + name: Orca Integration Tests + needs: [frontend] + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Download frontend dist + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8 + with: + name: frontend-dist + path: internal/net/html/dist + + - name: Set up Go + uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0 + with: + go-version-file: go.mod + cache-dependency-path: go.sum + + - name: Run orca-inttest + run: make orca-inttest + # ---------- Build ---------- build: name: Build diff --git a/Makefile b/Makefile index 5be64f18..1c0134c8 100644 --- a/Makefile +++ b/Makefile @@ -74,6 +74,14 @@ STAMP_LDFLAGS=-X github.com/Azure/unbounded/internal/version.Version=$(VERSION) METALMAN_IMAGE=$(CONTAINER_REGISTRY)/metalman:$(VERSION) +# Orca configuration +ORCA_BIN=bin/orca +ORCA_CMD=./cmd/orca +ORCA_IMAGE ?= $(CONTAINER_REGISTRY)/orca:$(VERSION) +ORCA_NAMESPACE ?= unbounded-kube +ORCA_MANIFEST_TEMPLATES_DIR := deploy/orca +ORCA_MANIFEST_RENDERED_DIR := deploy/orca/rendered + # kubectl-unbounded also stamps the metalman image reference. KUBECTL_UNBOUNDED_LDFLAGS=$(STAMP_LDFLAGS) -X github.com/Azure/unbounded/cmd/kubectl-unbounded/app.MetalmanImage=$(METALMAN_IMAGE) @@ -112,6 +120,7 @@ REACT_DEV ?= false .PHONY: all help fmt lint test build vulncheck check-deps kubectl-unbounded kubectl-unbounded-build install-tools install-protoc generate kubectl-unbounded forge unbounded-agent machina machina-build machina-oci machina-oci-push machina-manifests machine-ops-controller machine-ops-controller-build machine-ops-controller-oci machine-ops-controller-oci-push machine-ops-manifests metalman metalman-build metalman-oci metalman-oci-push gomod docs-serve unbounded-net-controller unbounded-net-node unbounded-net-routeplan-debug unping unroute notice notice-check .PHONY: net-frontend net-frontend-clean net-build-ebpf net-manifests release-manifests .PHONY: image-machina-local image-machine-ops-controller-local image-metalman-local image-net-controller-local image-net-node-local images-local +.PHONY: orca orca-build orca-manifests orca-oci orca-oci-push orca-up orca-down orca-reset orca-inttest image-orca-local ##@ General @@ -176,6 +185,8 @@ help: ## Show this help @echo " machina-oci-push Build machina image and push" @echo " machine-ops-controller-oci-push Build machine-ops-controller image and push" @echo " metalman-oci-push Build metalman image and push" + @echo " image-orca-local Build orca image" + @echo " orca-oci-push Build orca image and push" @echo "" @echo "Net Frontend:" @echo " net-frontend Build frontend into \$$(NET_FRONTEND_DIST_DIR) (cached)" @@ -188,10 +199,19 @@ help: ## Show this help @echo " machina-manifests Render machina manifests into deploy/machina/rendered" @echo " machine-ops-manifests Render machine-ops manifests into deploy/machine-ops/rendered" @echo " net-manifests Render net manifests into \$$(NET_MANIFEST_RENDERED_DIR)" + @echo " orca-manifests Render orca manifests into deploy/orca/rendered" @echo "" @echo "Net Kubernetes (apply to current kubectl context):" @echo " See \`make -C hack/net help\` for cluster deploy/undeploy targets." @echo "" + @echo "Orca Dev Harness (Kind cluster):" + @echo " orca | orca-build Build orca binary (with/without lint/test)" + @echo " orca-up Bring up Orca dev harness in Kind" + @echo " orca-down Tear down Orca dev harness Kind cluster" + @echo " orca-reset Rebuild image and rollout-restart deployment" + @echo " orca-inttest Run orca integration tests (Docker required)" + @echo " See \`make -C hack/orca help\` for full list." + @echo "" @echo "Documentation:" @echo " docs-serve Start local Hugo dev server" @echo "" @@ -570,6 +590,58 @@ metalman-oci: image-metalman-local ## Alias for image-metalman-local metalman-oci-push: metalman-oci ## Build and push the metalman container image $(CONTAINER_ENGINE) push $(METALMAN_IMAGE) +##@ Orca + +orca-build: ## Build the orca binary (no lint/test) + $(GOBUILD) -ldflags '$(STAMP_LDFLAGS)' -o $(ORCA_BIN) $(ORCA_CMD)/main.go + +orca: test orca-build ## Build the orca binary (implies test) + +orca-manifests: ## Render orca deployment manifests into deploy/orca/rendered + @mkdir -p $(ORCA_MANIFEST_RENDERED_DIR) + @find $(ORCA_MANIFEST_RENDERED_DIR) -mindepth 1 -not -name .gitignore -delete 2>/dev/null || true + $(GOCMD) run ./hack/cmd/render-manifests \ + --templates-dir $(ORCA_MANIFEST_TEMPLATES_DIR) \ + --output-dir $(ORCA_MANIFEST_RENDERED_DIR) \ + --set Namespace=$(ORCA_NAMESPACE) \ + --set Image=$(ORCA_IMAGE) + @echo "Rendered orca manifests into $(ORCA_MANIFEST_RENDERED_DIR) (image: $(ORCA_IMAGE))" + +image-orca-local: ## Build the orca container image locally (single-arch) + $(CONTAINER_ENGINE) build \ + --build-arg VERSION=$(VERSION) \ + --build-arg GIT_COMMIT=$(GIT_COMMIT) \ + --build-arg BUILD_TIME=$(BUILD_TIME) \ + -t orca:$(VERSION) -t $(ORCA_IMAGE) \ + -f ./images/orca/Containerfile . + +orca-oci: image-orca-local ## Alias for image-orca-local + +orca-oci-push: orca-oci ## Build and push the orca container image + $(CONTAINER_ENGINE) push $(ORCA_IMAGE) + +# Dev-cluster proxy targets. The actual implementations live in +# hack/orca/Makefile (see AGENTS.md convention; mirrors hack/net/). +orca-up: ## Bring up the Orca dev harness in a Kind cluster + $(MAKE) -C hack/orca up + +orca-down: ## Tear down the Orca dev harness Kind cluster + $(MAKE) -C hack/orca down + +orca-reset: ## Rebuild orca image and rolling-restart the dev deployment + $(MAKE) -C hack/orca reset + +# orca-inttest mirrors the test/test-race pattern: race detector in CI +# (ubuntu-latest has gcc), no -race locally so developers without a C +# toolchain can still run integration tests. +ifdef CI +orca-inttest: ## Run orca integration tests (LocalStack + Azurite via testcontainers; requires Docker) + $(GOTEST) -tags=integrationtest -race -timeout 15m ./internal/orca/inttest/... +else +orca-inttest: ## Run orca integration tests (LocalStack + Azurite via testcontainers; requires Docker) + $(GOTEST) -tags=integrationtest -timeout 15m ./internal/orca/inttest/... +endif + image-net-controller-local: net-frontend resources/cni-plugins-linux-$(HOST_GOARCH)-$(CNI_PLUGINS_VERSION).tgz ## Build the unbounded-net-controller image locally (single-arch) $(CONTAINER_ENGINE) build \ --target controller \ diff --git a/cmd/orca/main.go b/cmd/orca/main.go new file mode 100644 index 00000000..f7ea8484 --- /dev/null +++ b/cmd/orca/main.go @@ -0,0 +1,10 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package main + +import "github.com/Azure/unbounded/cmd/orca/orca" + +func main() { + orca.Run() +} diff --git a/cmd/orca/orca/orca.go b/cmd/orca/orca/orca.go new file mode 100644 index 00000000..48ac19ae --- /dev/null +++ b/cmd/orca/orca/orca.go @@ -0,0 +1,134 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package orca wires the Orca cache binary together. It is invoked by +// cmd/orca/main.go and is responsible for parsing flags, loading the +// YAML config, and delegating to internal/orca/app for actual runtime +// wiring. +package orca + +import ( + "context" + "fmt" + "log/slog" + "os" + "os/signal" + "strings" + "syscall" + "time" + + "github.com/spf13/cobra" + + "github.com/Azure/unbounded/internal/orca/app" + "github.com/Azure/unbounded/internal/orca/config" +) + +// Run is the entrypoint invoked by cmd/orca/main.go. +func Run() { + root := &cobra.Command{ + Use: "orca", + Short: "Orca origin cache - S3-compatible read-only cache fronting Azure / S3 origins", + } + root.AddCommand(newServeCmd()) + + if err := root.Execute(); err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(1) + } +} + +func newServeCmd() *cobra.Command { + var configPath string + + cmd := &cobra.Command{ + Use: "serve", + Short: "Run the Orca cache server", + RunE: func(cmd *cobra.Command, _ []string) error { + return serve(cmd.Context(), configPath) + }, + } + cmd.Flags().StringVarP(&configPath, "config", "c", "/etc/orca/config.yaml", + "path to YAML config file") + + return cmd +} + +func serve(parent context.Context, configPath string) error { + cfg, err := config.Load(configPath) + if err != nil { + return fmt.Errorf("load config: %w", err) + } + + level, err := resolveLogLevel(cfg.Logging.Level) + if err != nil { + return err + } + + levelVar := new(slog.LevelVar) + levelVar.Set(level) + + log := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ + Level: levelVar, + AddSource: true, + })) + slog.SetDefault(log) + + log.Info("orca starting", + "config_path", configPath, + "log_level", level.String(), + ) + + log.Info("config loaded", + "origin_id", cfg.Origin.ID, + "replicas_target", cfg.Cluster.TargetReplicas, + "target_global", cfg.Origin.TargetGlobal, + "internal_tls", cfg.Cluster.InternalTLS.Enabled, + "client_auth", cfg.Server.Auth.Enabled, + ) + + ctx, cancel := signal.NotifyContext(parent, os.Interrupt, syscall.SIGTERM) + defer cancel() + + a, err := app.Start(ctx, cfg, app.WithLogger(log)) + if err != nil { + return err + } + + if waitErr := a.Wait(ctx); waitErr != nil { + log.Error("listener exited with error", "err", waitErr) + cancel() + } else { + log.Info("shutdown signal received") + } + + shutdownCtx, shCancel := context.WithTimeout(context.Background(), 10*time.Second) + defer shCancel() + + // Propagate Shutdown errors to the process exit code so that + // failed-shutdown signals (kubelet probes, init systems) match + // reality. App.Shutdown also logs each individual error + // internally, so this only governs the exit-code semantics. + shutdownErr := a.Shutdown(shutdownCtx) + + log.Info("orca stopped") + + return shutdownErr +} + +// resolveLogLevel determines the effective slog.Level by consulting +// the ORCA_LOG_LEVEL environment variable first; if unset or empty, +// falls back to the YAML-configured value. An unrecognised value +// (from either source) returns a parse error so misconfiguration is +// surfaced at startup rather than silently degrading to info. +func resolveLogLevel(yamlLevel string) (slog.Level, error) { + if env := strings.TrimSpace(os.Getenv("ORCA_LOG_LEVEL")); env != "" { + level, err := config.ParseLogLevel(env) + if err != nil { + return 0, fmt.Errorf("ORCA_LOG_LEVEL: %w", err) + } + + return level, nil + } + + return config.ParseLogLevel(yamlLevel) +} diff --git a/cmd/orca/orca/orca_test.go b/cmd/orca/orca/orca_test.go new file mode 100644 index 00000000..ca3c3352 --- /dev/null +++ b/cmd/orca/orca/orca_test.go @@ -0,0 +1,58 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package orca + +import ( + "log/slog" + "testing" +) + +// TestResolveLogLevel_PrecedenceAndDefault covers the resolution +// order documented on resolveLogLevel: ORCA_LOG_LEVEL wins when +// set and non-empty (after trim), otherwise the YAML-configured +// value is used, otherwise the empty string defaults through +// config.ParseLogLevel to info. +func TestResolveLogLevel_PrecedenceAndDefault(t *testing.T) { + tests := []struct { + name string + yamlLevel string + envLevel string // "" -> simulate unset via Setenv with "" + want slog.Level + wantErr bool + }{ + {"empty yaml, no env -> info", "", "", slog.LevelInfo, false}, + {"yaml info, no env", "info", "", slog.LevelInfo, false}, + {"yaml debug, no env", "debug", "", slog.LevelDebug, false}, + {"yaml info overridden by env debug", "info", "debug", slog.LevelDebug, false}, + {"yaml debug overridden by env warn", "debug", "warn", slog.LevelWarn, false}, + {"whitespace env falls back to yaml", "warn", " ", slog.LevelWarn, false}, + {"invalid yaml fails", "trace", "", 0, true}, + {"invalid env fails even when yaml valid", "info", "trace", 0, true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Setenv("ORCA_LOG_LEVEL", tt.envLevel) + + got, err := resolveLogLevel(tt.yamlLevel) + if tt.wantErr { + if err == nil { + t.Errorf("resolveLogLevel(%q) = %v, want error", tt.yamlLevel, got) + } + + return + } + + if err != nil { + t.Errorf("resolveLogLevel(%q) unexpected err: %v", tt.yamlLevel, err) + return + } + + if got != tt.want { + t.Errorf("resolveLogLevel(yaml=%q, env=%q) = %v, want %v", + tt.yamlLevel, tt.envLevel, got, tt.want) + } + }) + } +} diff --git a/deploy/orca/01-namespace.yaml.tmpl b/deploy/orca/01-namespace.yaml.tmpl new file mode 100644 index 00000000..fd353a35 --- /dev/null +++ b/deploy/orca/01-namespace.yaml.tmpl @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca diff --git a/deploy/orca/02-rbac.yaml.tmpl b/deploy/orca/02-rbac.yaml.tmpl new file mode 100644 index 00000000..5961196b --- /dev/null +++ b/deploy/orca/02-rbac.yaml.tmpl @@ -0,0 +1,8 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: orca + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca diff --git a/deploy/orca/03-config.yaml.tmpl b/deploy/orca/03-config.yaml.tmpl new file mode 100644 index 00000000..26ac7f82 --- /dev/null +++ b/deploy/orca/03-config.yaml.tmpl @@ -0,0 +1,74 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: orca-config + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca +data: + config.yaml: | + # Orca origin cache configuration. + # Secret values (account keys, S3 access/secret) are sourced from + # environment variables ORCA_AZUREBLOB_ACCOUNT_KEY, + # ORCA_CACHESTORE_S3_ACCESS_KEY, ORCA_CACHESTORE_S3_SECRET_KEY, + # populated by the orca-credentials Secret via envFrom. + + server: + listen: "0.0.0.0:8443" + auth: + # Dev: disabled. Production: enable bearer or mtls. + enabled: {{ default "false" .ServerAuthEnabled }} + + origin: + id: {{ default "azureblob-default" .OriginID | quote }} + driver: {{ default "azureblob" .OriginDriver }} + target_global: {{ default "192" .TargetGlobal }} + queue_timeout: 5s + retry: + attempts: 3 + backoff_initial: 100ms + backoff_max: 2s + max_total_duration: 5s + azureblob: + account: {{ default "" .AzureAccount | quote }} + container: {{ default "" .AzureContainer | quote }} + endpoint: {{ default "" .AzureEndpoint | quote }} + awss3: + endpoint: {{ default "" .OriginAWSS3Endpoint | quote }} + region: {{ default "us-east-1" .OriginAWSS3Region | quote }} + bucket: {{ default "" .OriginAWSS3Bucket | quote }} + use_path_style: {{ default "false" .OriginAWSS3UsePathStyle }} + + cachestore: + driver: s3 + s3: + endpoint: {{ default "http://localstack.unbounded-kube.svc.cluster.local:4566" .CachestoreEndpoint | quote }} + bucket: {{ default "orca-cache" .CachestoreBucket | quote }} + region: {{ default "us-east-1" .CachestoreRegion | quote }} + use_path_style: true + + cluster: + service: {{ default "orca-peers.unbounded-kube.svc.cluster.local" .ClusterService | quote }} + membership_refresh: 5s + internal_listen: "0.0.0.0:8444" + target_replicas: {{ default "3" .TargetReplicas }} + internal_tls: + # Dev: disabled (plain HTTP/2 between peers). Production: true. + enabled: {{ default "false" .InternalTLSEnabled }} + + chunk_catalog: + max_entries: 100000 + + metadata: + ttl: 5m + negative_ttl: 60s + max_entries: 10000 + + chunking: + size: 8388608 + + logging: + # One of debug, info, warn, error. Overridden at runtime by the + # ORCA_LOG_LEVEL environment variable when set. + level: {{ default "info" .LogLevel | quote }} diff --git a/deploy/orca/04-deployment.yaml.tmpl b/deploy/orca/04-deployment.yaml.tmpl new file mode 100644 index 00000000..d2f11397 --- /dev/null +++ b/deploy/orca/04-deployment.yaml.tmpl @@ -0,0 +1,91 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: orca + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca +spec: + replicas: {{ default "3" .TargetReplicas }} + # Required pod-anti-affinity below pins one Orca pod per node. + # In the dev harness the worker count == replica count, so default + # RollingUpdate can't surge: the new pod has no node to land on. + # maxSurge=0 / maxUnavailable=1 walks the replicas one-at-a-time. + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 0 + maxUnavailable: 1 + selector: + matchLabels: + app.kubernetes.io/name: orca + template: + metadata: + labels: + app.kubernetes.io/name: orca + spec: + serviceAccountName: orca + # Required anti-affinity: at most one Orca pod per node so that a + # single node failure does not knock out multiple replicas. The + # dev harness Kind cluster has 3 worker nodes to match the default + # 3 replicas. + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/name: orca + topologyKey: kubernetes.io/hostname + containers: + - name: orca + image: {{ default "ghcr.io/azure/orca:latest" .Image | quote }} + imagePullPolicy: {{ default "IfNotPresent" .ImagePullPolicy }} + args: + - serve + - --config=/etc/orca/config.yaml + env: + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + envFrom: + - secretRef: + name: orca-credentials + ports: + - containerPort: 8443 + name: edge + protocol: TCP + - containerPort: 8444 + name: internal + protocol: TCP + - containerPort: 8442 + name: ops + protocol: TCP + livenessProbe: + httpGet: + path: /healthz + port: ops + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /readyz + port: ops + initialDelaySeconds: 2 + periodSeconds: 5 + resources: + requests: + cpu: {{ default "200m" .ResourceCPURequest }} + memory: {{ default "256Mi" .ResourceMemoryRequest }} + limits: + cpu: {{ default "2" .ResourceCPULimit }} + memory: {{ default "1Gi" .ResourceMemoryLimit }} + volumeMounts: + - name: config + mountPath: /etc/orca + readOnly: true + volumes: + - name: config + configMap: + name: orca-config diff --git a/deploy/orca/05-service.yaml.tmpl b/deploy/orca/05-service.yaml.tmpl new file mode 100644 index 00000000..36dba4fd --- /dev/null +++ b/deploy/orca/05-service.yaml.tmpl @@ -0,0 +1,43 @@ +--- +# Client-facing Service: standard ClusterIP. Clients of the cache (e.g. +# tools speaking S3 to fetch objects) connect here. Kube-proxy load +# balances across the 3 replicas. +apiVersion: v1 +kind: Service +metadata: + name: orca + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca +spec: + type: ClusterIP + selector: + app.kubernetes.io/name: orca + ports: + - name: edge + port: 8443 + targetPort: edge + protocol: TCP + +--- +# Peer-discovery Service: headless (ClusterIP: None). LookupHost on +# orca-peers..svc.cluster.local returns all pod IPs, enabling +# rendezvous-hash coordination among Orca replicas. +apiVersion: v1 +kind: Service +metadata: + name: orca-peers + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca +spec: + type: ClusterIP + clusterIP: None + publishNotReadyAddresses: true + selector: + app.kubernetes.io/name: orca + ports: + - name: internal + port: 8444 + targetPort: internal + protocol: TCP diff --git a/deploy/orca/dev/01-localstack.yaml.tmpl b/deploy/orca/dev/01-localstack.yaml.tmpl new file mode 100644 index 00000000..87dfcc02 --- /dev/null +++ b/deploy/orca/dev/01-localstack.yaml.tmpl @@ -0,0 +1,83 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: localstack + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: localstack + app.kubernetes.io/part-of: orca-dev +spec: + type: ClusterIP + selector: + app.kubernetes.io/name: localstack + ports: + - name: edge + port: 4566 + targetPort: 4566 + protocol: TCP + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: localstack + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: localstack + app.kubernetes.io/part-of: orca-dev +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: localstack + template: + metadata: + labels: + app.kubernetes.io/name: localstack + app.kubernetes.io/part-of: orca-dev + spec: + containers: + - name: localstack + # 3.8 is community-tier; 'latest' became Pro-only and exits + # with code 55 ("License activation failed"). + image: {{ default "localstack/localstack:3.8" .LocalstackImage | quote }} + imagePullPolicy: IfNotPresent + ports: + - containerPort: 4566 + name: edge + protocol: TCP + env: + - name: SERVICES + value: s3 + - name: DEBUG + value: "0" + - name: PERSISTENCE + value: "0" + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 1 + memory: 1Gi + readinessProbe: + httpGet: + path: /_localstack/health + port: 4566 + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 3 + livenessProbe: + httpGet: + path: /_localstack/health + port: 4566 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + volumeMounts: + - name: data + mountPath: /var/lib/localstack + volumes: + - name: data + emptyDir: {} diff --git a/deploy/orca/dev/02-init-job.yaml.tmpl b/deploy/orca/dev/02-init-job.yaml.tmpl new file mode 100644 index 00000000..41285369 --- /dev/null +++ b/deploy/orca/dev/02-init-job.yaml.tmpl @@ -0,0 +1,81 @@ +--- +# Init Job: creates the cachestore + origin S3 buckets in LocalStack so +# that Orca can pass the versioningGate boot check and so that reviewers +# have an origin bucket to seed sample objects into. Idempotent: +# CreateBucket returns BucketAlreadyOwnedByYou on rerun, swallowed by +# the script. +# +# Cachestore bucket: versioning left unset (the driver unconditionally +# refuses to start against a versioned bucket since If-None-Match: * +# is not honored on versioned buckets). +# Origin bucket: no versioning constraint; sample objects live here. +apiVersion: batch/v1 +kind: Job +metadata: + name: orca-buckets-init + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca + app.kubernetes.io/part-of: orca-dev +spec: + backoffLimit: 6 + template: + metadata: + labels: + app.kubernetes.io/name: orca + app.kubernetes.io/part-of: orca-dev + spec: + restartPolicy: OnFailure + containers: + - name: aws-cli + image: {{ default "amazon/aws-cli:latest" .AwsCliImage | quote }} + env: + - name: AWS_ACCESS_KEY_ID + value: test + - name: AWS_SECRET_ACCESS_KEY + value: test + - name: AWS_DEFAULT_REGION + value: us-east-1 + - name: CACHESTORE_BUCKET + value: {{ default "orca-cache" .CachestoreBucket | quote }} + - name: ORIGIN_BUCKET + value: {{ default "orca-origin" .OriginBucket | quote }} + - name: ENDPOINT + value: http://localstack.{{ default "unbounded-kube" .Namespace }}.svc.cluster.local:4566 + command: + - /bin/sh + - -c + - | + set -e + echo "Waiting for LocalStack at $ENDPOINT ..." + for i in $(seq 1 60); do + if aws --endpoint-url "$ENDPOINT" s3api list-buckets >/dev/null 2>&1; then + echo "LocalStack ready." + break + fi + sleep 2 + done + + ensure_bucket() { + bucket="$1" + echo "Ensuring bucket $bucket (idempotent) ..." + if aws --endpoint-url "$ENDPOINT" s3api head-bucket --bucket "$bucket" >/dev/null 2>&1; then + echo "Bucket $bucket already exists." + else + aws --endpoint-url "$ENDPOINT" s3api create-bucket --bucket "$bucket" + echo "Bucket $bucket created." + fi + } + + ensure_bucket "$CACHESTORE_BUCKET" + ensure_bucket "$ORIGIN_BUCKET" + + # Verify cachestore bucket versioning is unset (Orca's + # versioningGate rejects Enabled or Suspended). + status=$(aws --endpoint-url "$ENDPOINT" s3api get-bucket-versioning --bucket "$CACHESTORE_BUCKET" --query Status --output text 2>/dev/null || echo "None") + echo "Cachestore bucket versioning: $status (None means unset, which is required)." + if [ "$status" = "Enabled" ] || [ "$status" = "Suspended" ]; then + echo "ERROR: cachestore bucket versioning is $status; Orca requires unset/None." + exit 1 + fi + echo "Init complete." diff --git a/deploy/orca/dev/03-azurite.yaml.tmpl b/deploy/orca/dev/03-azurite.yaml.tmpl new file mode 100644 index 00000000..e70209e8 --- /dev/null +++ b/deploy/orca/dev/03-azurite.yaml.tmpl @@ -0,0 +1,117 @@ +--- +# Azurite is Microsoft's official Azure Storage emulator. We use it as +# an alternative origin in the dev harness so reviewers can exercise +# the azureblob origin driver path without a real Azure account. +# +# Well-known dev account/key (documented at +# https://learn.microsoft.com/azure/storage/common/storage-use-azurite): +# AccountName: devstoreaccount1 +# AccountKey: Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw== +# BlobURL: http://azurite..svc.cluster.local:10000/devstoreaccount1 +apiVersion: v1 +kind: Service +metadata: + name: azurite + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: azurite + app.kubernetes.io/part-of: orca-dev +spec: + # NodePort so the host-side seeder tool (hack/cmd/orcaseed) can + # reach Azurite without a kubectl port-forward. Kind binds node + # ports to the host's loopback, so the seeder talks to + # http://localhost:/devstoreaccount1/. The fixed port + # (default 30100) sits in the Kubernetes NodePort range + # (30000-32767). Two concurrent dev clusters on the same host + # would collide; override via AzuriteNodePort in the renderer + # invocation if you run more than one. + type: NodePort + selector: + app.kubernetes.io/name: azurite + ports: + - name: blob + port: 10000 + targetPort: 10000 + nodePort: {{ default "30100" .AzuriteNodePort }} + protocol: TCP + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: azurite + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: azurite + app.kubernetes.io/part-of: orca-dev +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: azurite + template: + metadata: + labels: + app.kubernetes.io/name: azurite + app.kubernetes.io/part-of: orca-dev + spec: + containers: + - name: azurite + image: {{ default "mcr.microsoft.com/azure-storage/azurite:3.33.0" .AzuriteImage | quote }} + imagePullPolicy: IfNotPresent + # Bind to 0.0.0.0 so the Service can reach it; default is + # 127.0.0.1. + # --skipApiVersionCheck allows newer Azure SDK clients + # (which advertise API versions Azurite hasn't yet caught up + # with) to talk to it. + # --loose disables strict validation of newer SDK headers. + # --disableProductStyleUrl forces path-style URL parsing. + # Without it, Azurite parses the first DNS label of the Host + # header as the account name (so requests to azurite.... + # would be misinterpreted as account="azurite" rather than + # account="devstoreaccount1"). + # --debug routes Azurite's internal request log to a file; + # tail it via `kubectl exec ... -- cat /tmp/azurite-debug.log` + # when triaging 4xx responses. + args: + - azurite-blob + - --blobHost + - 0.0.0.0 + - --blobPort + - "10000" + - --skipApiVersionCheck + - --loose + - --disableProductStyleUrl + - --debug + - /tmp/azurite-debug.log + - --location + - /data + ports: + - containerPort: 10000 + name: blob + protocol: TCP + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi + readinessProbe: + tcpSocket: + port: 10000 + initialDelaySeconds: 3 + periodSeconds: 5 + timeoutSeconds: 3 + livenessProbe: + tcpSocket: + port: 10000 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + volumeMounts: + - name: data + mountPath: /data + volumes: + - name: data + emptyDir: {} diff --git a/deploy/orca/dev/04-azurite-init.yaml.tmpl b/deploy/orca/dev/04-azurite-init.yaml.tmpl new file mode 100644 index 00000000..8ad9433f --- /dev/null +++ b/deploy/orca/dev/04-azurite-init.yaml.tmpl @@ -0,0 +1,54 @@ +--- +# Init Job: creates the Azure container in Azurite so Orca's azureblob +# origin driver has somewhere to read from. Idempotent: az container +# create with --fail-on-exist false treats existence as success. +# +# Uses the well-known Azurite dev creds (devstoreaccount1 + the +# documented public key); these are baked into Azurite and not +# secrets. +apiVersion: batch/v1 +kind: Job +metadata: + name: orca-azurite-container-init + namespace: {{ default "unbounded-kube" .Namespace }} + labels: + app.kubernetes.io/name: orca + app.kubernetes.io/part-of: orca-dev +spec: + backoffLimit: 6 + template: + metadata: + labels: + app.kubernetes.io/name: orca + app.kubernetes.io/part-of: orca-dev + spec: + restartPolicy: OnFailure + containers: + - name: az-cli + image: {{ default "mcr.microsoft.com/azure-cli:latest" .AzCliImage | quote }} + env: + - name: AZURE_STORAGE_CONNECTION_STRING + value: "DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite.{{ default "unbounded-kube" .Namespace }}.svc.cluster.local:10000/devstoreaccount1;" + - name: CONTAINER + value: {{ default "orca-test" .AzuriteContainer | quote }} + command: + - /bin/sh + - -c + - | + set -e + echo "Waiting for Azurite ..." + for i in $(seq 1 60); do + if az storage container list --output none 2>/dev/null; then + echo "Azurite ready." + break + fi + sleep 2 + done + echo "Ensuring container ${CONTAINER} (idempotent) ..." + if az storage container exists --name "${CONTAINER}" --query exists --output tsv | grep -qi true; then + echo "Container ${CONTAINER} already exists." + else + az storage container create --name "${CONTAINER}" --output none + echo "Container ${CONTAINER} created." + fi + echo "Init complete." \ No newline at end of file diff --git a/deploy/orca/rendered/.gitignore b/deploy/orca/rendered/.gitignore new file mode 100644 index 00000000..f79c394d --- /dev/null +++ b/deploy/orca/rendered/.gitignore @@ -0,0 +1,3 @@ +# rendered manifests are gitignored; produced by `make orca-manifests`. +* +!.gitignore diff --git a/designs/orca/brief.md b/designs/orca/brief.md new file mode 100644 index 00000000..db51c35b --- /dev/null +++ b/designs/orca/brief.md @@ -0,0 +1,206 @@ +# Orca - Origin Cache - Architecture Brief + +A one-screen orientation: what Orca is, the load-bearing +decisions, and the risks. For mechanism and flow, see +[design.md](./design.md). + +## 1. Problem and approach + +Cloud blob storage (AWS S3, Azure Blob) is slow and expensive +when many on-prem clients read from it at once. Orca's target +workload is large immutable artifacts - job inputs, model +weights, training shards - read by thousands of clients with +correlated cold starts. Direct cloud access at that scale is a +cost and latency problem. + +Orca is a read-only S3-compatible HTTP cache that sits inside +the on-prem datacenter as a multi-replica Kubernetes Deployment. +It fronts AWS S3 and Azure Blob, serves chunked bytes keyed by +ETag out of a shared in-DC store, and makes sure the same chunk +is fetched only once no matter how many clients ask for it. +Clients use the same `GetObject` / `HeadObject` / `ListObjectsV2` +calls they already use. + +## 2. Goals and non-goals + +In scope: +- Read-only S3-compatible API: `GetObject` with `Range`, + `HeadObject`, minimal `ListObjectsV2` pass-through. +- Multi-PB working set; thousands of concurrent clients. +- One Orca deployment per datacenter, no cross-DC peering. +- Near-zero origin stampede under correlated cold-access bursts. +- Fast TTFB on both hits and misses. +- Atomic, durable commit of fetched chunks. +- Bounded staleness: at most 5 minutes if an operator overwrites + a key in place (`metadata.ttl`), at most 60 seconds for the + "uploaded after a 404" case (`metadata.negative_ttl`). + Otherwise zero. + +Out of scope: +- Writes, multipart uploads, object versioning. +- Cross-DC peering. +- SigV4 verification (bearer / mTLS hooks exist but nothing + enforces them yet). +- Multi-tenant quotas; per-client / per-IP rate limiting. +- Origin-pushed invalidation (the ETag covers it). +- Encryption at rest beyond what the backing store provides. + +## 3. System at a glance + +A client request lands on one replica, the **assembler**. The +assembler walks the requested byte range chunk by chunk. Hits +read directly from the shared **CacheStore**. Misses go to the +chunk's **coordinator** - the one replica a hash on chunk +identity picks from the headless Service membership. That +coordinator deduplicates concurrent fetches with a per-`ChunkKey` +singleflight, calls the **Origin**, and commits to the +CacheStore in a single no-overwrite write. The coordinator may +be the same replica as the assembler (local fill) or a different +one (called over the internal fill RPC). + +### Diagram A: System overview + +```mermaid +graph TB + subgraph DC["On-prem datacenter"] + Clients["Edge clients"] + Service["Service (ClusterIP / LB)
client traffic"] + subgraph Replicas["orca Deployment"] + R1["Replica 1
:8443 edge
:8444 internal
:8442 ops"] + R2["Replica 2"] + R3["Replica N"] + end + Headless["Headless Service
peer discovery"] + Internal["Internal listener :8444
per-chunk fill RPC"] + Ops["Ops :8442
/healthz, /readyz
(kubelet only)"] + CS[("CacheStore
in-DC S3-compatible")] + end + subgraph Cloud["Cloud origins"] + S3[("AWS S3")] + Azure[("Azure Blob
Block Blobs only")] + end + Clients -- "S3 GET / HEAD / LIST
+ Range" --> Service + Service --> R1 + Service --> R2 + Service --> R3 + R1 -. "DNS refresh
default 5s" .-> Headless + R2 -.-> Headless + R3 -.-> Headless + R1 <--> Internal + R2 <--> Internal + R3 <--> Internal + R1 -.- Ops + R2 -.- Ops + R3 -.- Ops + R1 <--> CS + R2 <--> CS + R3 <--> CS + R1 -- "miss-fill
If-Match: etag" --> S3 + R2 -- "miss-fill
If-Match: etag" --> S3 + R3 -- "miss-fill
If-Match: etag" --> Azure +``` + +## 4. Five load-bearing mechanisms + +### 4.1 Chunking and identity + +Objects are split into fixed-size chunks (8 MiB by default, +tunable). A chunk's name (`ChunkKey`) is +`{origin_id, bucket, object_key, etag, chunk_size, chunk_index}`, +and that name deterministically becomes the chunk's storage +path. The ETag is the key's identity: a new ETag means a new +path, so Orca cannot serve old bytes for a new ETag by +construction. Empty-ETag origin responses are rejected at +`Head`. + +The chunk size is not fixed. For bigger objects the edge picks a +bigger chunk size (8 MiB up to 128 MiB by default, see +`chunking.tiers`), so the per-object request count stays +manageable. The edge also fetches the next few chunks in +parallel while sending the current one to the client +(`chunking.readahead`, default 8). Both knobs help large-blob +throughput without changing how chunks are stored or addressed. + +### 4.2 Singleflight + commit-after-serve + +The coordinator's singleflight collapses many concurrent misses +for the same chunk into a single origin fetch. The leader retries +transient origin errors up to 3 times in 5 seconds before sending +any client headers, releases joiners as soon as the chunk is in +memory and length-checked, and commits to the cachestore in +parallel. A commit failure is invisible to the client: the chunk +just isn't recorded and the next request refills. + +### 4.3 Per-chunk coordinator (rendezvous hashing) + +Each replica polls the headless Service for peer IPs every 5 +seconds and uses a rendezvous hash on chunk identity to pick one +coordinator per chunk. The assembler calls coordinators over the +internal listener (`:8444`, plain HTTP in dev). One client +request that spans N chunks can hit N different coordinators - +that's how Orca spreads hot chunks. Stale routes during +membership churn are caught by an `X-Orca-Internal: 1` header +plus a self-check on the receiver; a mismatch returns 409 and +the caller falls back to filling locally. + +### 4.4 Atomic-commit primitive + +The leader publishes a chunk to the CacheStore in one write that +won't overwrite. `cachestore/s3` uses `PutObject + +If-None-Match: *`; the loser of a race gets 412 and is recorded +as `ErrCommitLost`. At boot the driver runs two checks - a +self-test that proves the precondition is honored, and a +versioning gate that refuses to start on versioned buckets +(several S3-compatible backends ignore `If-None-Match: *` on +them). + +### 4.5 Bounded staleness contract + +Operators promise: once a key is published, its bytes never +change. To change the data, publish a new key. As long as the +promise holds, Orca cannot serve stale bytes (the ETag is in +the chunk's path). If the promise is broken, Orca may serve old +bytes for up to 5 minutes (`metadata.ttl`). That's the +load-bearing correctness statement and must appear in +consumer-API docs. Every `Origin.GetRange` also carries +`If-Match: ` as a safety net. A matching bound applies to +the "uploaded after a 404" case: 60 seconds +(`metadata.negative_ttl`) per replica that saw the original 404. + +## 5. Backing-store options + +One driver ships today: + +- `cachestore/s3` - an in-DC S3-compatible object store (VAST in + production, LocalStack in dev). Atomic-commit primitive is + `PutObject + If-None-Match: *`; the boot self-test and the + versioning gate keep it honest. + +Shared-POSIX-filesystem drivers (`cachestore/posixfs`, +`cachestore/localfs`) were designed and not built. See +[design.md s13](./design.md#13-deferred--future-work). + +## 6. Top risks + +| Risk | What goes wrong | Bound | Detail | +|---|---|---|---| +| Immutable-origin promise | Operator overwrites a key instead of publishing a new one | Up to 5 min stale (`metadata.ttl`) | [s9](./design.md#9-bounded-staleness-contract) | +| Empty-ETag origin | Two versions share a storage path; corrupt reads | Rejected at `Head`; 502 `OriginMissingETag` | [s2](./design.md#2-decisions) | +| Commit-after-serve failure | Client got bytes; cachestore commit failed | Chunk unrecorded; next request refills. Debug logs only today | [s7.7](./design.md#77-failure-handling-without-re-stampede) | +| Approximate origin cap | Scale changes mis-size the cluster-wide cap | Mirror replica count into `cluster.target_replicas` | [s13](./design.md#13-deferred--future-work) | +| Create-after-404 staleness | Upload after a 404 reached a client | Up to 60s per replica (`metadata.negative_ttl`) | [s10](./design.md#10-create-after-404-and-negative-cache-lifecycle) | +| Auth stubbed | Bearer / mTLS hooks not enforced | Rely on NetworkPolicy until built | [s13](./design.md#13-deferred--future-work) | + +## 7. Where to go next + +`design.md` for the full picture: + +- [s2 Decisions](./design.md#2-decisions) +- [s3 Terminology](./design.md#3-terminology) +- [s4 Architecture](./design.md#4-architecture) +- [s7 Stampede protection](./design.md#7-stampede-protection) +- [s8 Atomic commit](./design.md#8-atomic-commit) +- [s9 Bounded staleness contract](./design.md#9-bounded-staleness-contract) +- [s10 Create-after-404](./design.md#10-create-after-404-and-negative-cache-lifecycle) +- [s11 Eviction and capacity](./design.md#11-eviction-and-capacity) +- [s13 Deferred / future work](./design.md#13-deferred--future-work) diff --git a/designs/orca/design.md b/designs/orca/design.md new file mode 100644 index 00000000..4c597d4c --- /dev/null +++ b/designs/orca/design.md @@ -0,0 +1,1290 @@ +# Orca - Origin Cache - Design + +What Orca does, how it does it, and the few decisions that keep it +correct under load. The shorter stakeholder version is in +[brief.md](./brief.md). + +## Table of contents + +1. [Overview](#1-overview) +2. [Decisions](#2-decisions) +3. [Terminology](#3-terminology) +4. [Architecture](#4-architecture) +5. [Chunk model](#5-chunk-model) +6. [Request flow](#6-request-flow) +7. [Stampede protection](#7-stampede-protection) +8. [Atomic commit](#8-atomic-commit) +9. [Bounded staleness contract](#9-bounded-staleness-contract) +10. [Create-after-404 and negative-cache lifecycle](#10-create-after-404-and-negative-cache-lifecycle) +11. [Eviction and capacity](#11-eviction-and-capacity) +12. [Horizontal scale](#12-horizontal-scale) +13. [Deferred / future work](#13-deferred--future-work) + +--- + +## 1. Overview + +Clients inside an on-prem datacenter need to read large files +that live in cloud blob storage (AWS S3, Azure Blob). Letting +every client read from the cloud directly costs too much, +adds too much latency, and pushes too much traffic across the +security boundary. + +Orca sits inside the datacenter and reads from cloud storage on +the clients' behalf. It speaks an S3-compatible HTTP API, so +clients use the same SDKs they already use. On a cache hit it +serves from a shared in-DC store. On a miss it fetches from the +cloud, saves the result, and returns it. + +Orca splits each object into fixed-size chunks (8 MiB by +default). Each chunk's storage path is a hash of the object's +identity (origin, bucket, key, ETag, chunk size). Orca runs as a +multi-replica Kubernetes Deployment. The replicas share one +in-DC store. They find each other through a headless Service. +For any given chunk a single hash picks one replica as the +chunk's "coordinator" - the only replica that's allowed to +fetch that chunk from the cloud. The other replicas ask the +coordinator over a private channel. The result: even if a +thousand clients ask for the same chunk at the same time, the +cloud sees exactly one fetch. + +## 2. Decisions + +| Area | Decision | +|---|---| +| Client API | S3-compatible HTTP. `GET` + `HEAD` + a minimal `ListObjectsV2` pass-through. Range reads work. | +| Auth surface | Bearer / mTLS hooks exist on the edge and the internal listener, but nothing checks them yet. Dev runs with auth off. See s4 and [Deferred / future work](#13-deferred--future-work). | +| Origins | AWS S3 and Azure Blob, behind a pluggable `Origin` interface. | +| Azure constraint | Block Blobs only. Page and Append blobs are rejected at `Head` with `UnsupportedBlobTypeError`. | +| Cachestore | An in-DC S3-compatible store (`cachestore/s3`): LocalStack in dev, VAST or similar in production. Treated as the truth for what chunks exist. | +| Atomic commit | `PutObject` with `If-None-Match: *`. The second concurrent commit gets a `412` and is recorded as `ErrCommitLost`. At boot, `SelfTestAtomicCommit` proves the backend honors the precondition; if it doesn't, the process refuses to start. | +| Versioned cachestore buckets | Not supported. At boot, `GetBucketVersioning` runs; if the bucket has versioning enabled or suspended, the process refuses to start. VAST and several S3-compatible backends ignore `If-None-Match: *` on versioned buckets, which would silently break the atomic-commit rule. | +| Chunking | Default 8 MiB (`chunking.size`). For bigger objects, an optional tier ladder (`chunking.tiers`) picks a larger size: 64 MiB for objects over 1 GiB, 128 MiB for objects over 10 GiB. The chunk size is part of the chunk's storage path, so changing the default or any tier never breaks existing data. Minimum 1 MiB. | +| Read-ahead | While the edge sends one chunk to the client, it can fetch the next few chunks in parallel. The default is 8 in flight. Set `chunking.readahead: 0` to turn it off. | +| Consistency | Operators promise: once a key is published, its bytes never change. To change the data, publish a new key. Orca treats the ETag as the key's identity, not as a freshness check. We also send `If-Match: ` on every fetch as a safety net. If an operator breaks the promise, the wrong data is served for at most 5 minutes (`metadata.ttl`). If a key is uploaded after someone already saw a 404 on it, the wrong 404 is served for at most 60 seconds (`metadata.negative_ttl`). See [s9](#9-bounded-staleness-contract). | +| ETag presence | The origin must return a non-empty ETag on `Head`. If it doesn't, Orca rejects the response with `origin.MissingETagError`. Without an ETag, two different versions of the same `(bucket, key)` would hash to the same storage path and Orca would silently serve old bytes. | +| Catalog | An in-memory LRU (`ChunkCatalog`) that remembers which chunks are in the cachestore. Presence-only - no size or access count. Capped at 100,000 entries by default. | +| Cluster | Kubernetes Deployment + headless Service for peer discovery + ClusterIP / LB for client traffic. A hash on the chunk's identity picks one replica as the chunk's coordinator. The replica that received the client request - the **assembler** - asks the right coordinator for each chunk in the range. On hits, any replica can read the cachestore directly. | +| Internal-listener auth | Config keys exist for mTLS, but nothing enforces them yet. Dev runs with mTLS off. | +| Origin concurrency cap | Each replica caps in-flight origin fetches at `floor(origin.target_global / cluster.target_replicas)` - 64 by default. When the origin throttles (503, 429, retryable 5xx), the leader retries with exponential backoff before sending any HTTP headers, so the client never sees the throttle. | +| Tenancy | One tenant, one set of origin credentials. | +| Listeners | Three: edge `:8443`, internal-fill `:8444`, ops `:8442` (`/healthz`, `/readyz`). All plain HTTP in dev. | +| Repo home | This repo. Code under `internal/orca/`, manifests under `deploy/orca/`, dev harness under `hack/orca/`. | + +## 3. Terminology + +- **Replica** - one running pod of the `orca` Deployment. Replicas + are interchangeable; they hold only in-memory caches. +- **Client** - whoever is calling the S3-compatible HTTP API. +- **Origin** - the upstream cloud store (AWS S3 or Azure Blob). + Orca only reads from it. Interface in + `internal/orca/origin/origin.go`. +- **CacheStore** - the shared in-DC chunk store. The truth for + what's cached. Today this is `cachestore/s3` (an in-DC + S3-compatible object store). Interface in + `internal/orca/cachestore/cachestore.go`; commit rules in + [s8](#8-atomic-commit). +- **Chunk** - one piece of an object. The size is chosen per + request from a small ladder: 8 MiB for small objects, up to 128 + MiB for objects over 10 GiB by default. Orca caches and fills + chunks, not whole objects. +- **ChunkKey** - the chunk's name: + `{origin_id, bucket, object_key, etag, chunk_size, chunk_index}`. + See [s5](#5-chunk-model). +- **Headless Service** - a Kubernetes Service with `clusterIP: None`. + Its DNS A-record returns the IPs of all Ready pods. Orca polls + it every 5s (default) to learn the current peers. +- **Rendezvous hashing** (HRW) - for a key, score every peer with + `hash(peer_ip || key)` and pick the highest score. Stable when + peers come and go: a chunk's owner only changes if its own + owner is added or removed. Orca uses this to pick one + coordinator per chunk. +- **Coordinator** - the replica the hash picks to fetch a chunk + on a miss. One coordinator per chunk, not per request and not + per object. +- **Assembler** - the replica that took the client request. It + walks the requested byte range chunk by chunk. For each chunk + it reads from the cachestore on a hit, or asks the chunk's + coordinator on a miss (locally or over the internal RPC). +- **Singleflight** - a small in-process trick: if a fetch for a + given chunk is already running, new requests for that chunk + wait for the running fetch instead of starting their own. The + first arrival is the **leader**; the rest are **joiners**. See + [s7.1](#71-per-chunkkey-singleflight). +- **Per-chunk internal fill RPC** - + `GET /internal/fill?` over plain HTTP on the + internal listener (`:8444` by default). The assembler calls it + when the coordinator is some other replica. +- **Atomic CacheStore commit** - the write that publishes a chunk + to the cachestore without overwriting anything. `PutObject` with + `If-None-Match: *`. If two replicas race, one wins with `200` + and the other gets `412` (recorded as `ErrCommitLost`). +- **Immutable-origin contract** - operators promise that once + they publish a key, its bytes never change. If they break this, + Orca may serve the old bytes for up to `metadata.ttl`. See + [s9](#9-bounded-staleness-contract). +- **Pre-header retry** - the leader retries a failed + `Origin.GetRange` up to 3 times within 5 seconds before sending + any HTTP header to the client. Transient origin failures stay + invisible. `OriginETagChangedError` is not retried. +- **Negative-cache entry** - a metadata-cache entry that + remembers a `404`, an `UnsupportedBlobTypeError`, or a + `MissingETagError`. Reused for 60 seconds by default + (`metadata.negative_ttl`). +- **S3 versioning gate** - a boot-time `GetBucketVersioning` + check. If the cachestore bucket has versioning enabled or + suspended, Orca refuses to start. +- **MissingETagError** - what the fetch coordinator returns when + the origin's `Head` response has no ETag. Comes back to the + client as a 502 `OriginMissingETag` and is cached negatively. + +## 4. Architecture + +Orca is a single binary deployed as a Kubernetes Deployment. +Replicas discover each other through a headless Service and +refresh the peer list every 5 seconds by default +(`cluster.membership_refresh`). + +A client request lands on one replica, the **assembler**. The +assembler walks the requested byte range chunk by chunk. For +each chunk: + +- If the chunk is in the cachestore, the assembler reads it + directly. Any replica can do this. +- If not, a hash on the chunk's identity picks the **coordinator** + for that chunk. If the coordinator is this replica, the + assembler fetches the chunk locally. If it's some other + replica, the assembler asks that replica over the internal-fill + RPC. + +One tenant. One set of origin credentials per deployment. + +Each replica runs three HTTP listeners: + +- **Edge (`:8443`)** - the S3-compatible client API. Auth is + wired in config but not enforced. Dev runs with + `server.auth.enabled: false`. +- **Internal-fill (`:8444`)** - serves `GET /internal/fill`, the + RPC between replicas. Plain HTTP in dev + (`cluster.internal_tls.enabled: false`). +- **Ops (`:8442`)** - serves `/healthz` (always 200 while the + process is up) and `/readyz` (200 once the cachestore + self-test has passed and the cluster has at least one peer-set + snapshot). Plain HTTP, no auth. Production manifests point the + kubelet probes here; the client Service does not expose this + port. + +### Diagram 1: System overview + +```mermaid +graph TB + subgraph DC["On-prem datacenter"] + Clients["Edge clients"] + Service["Service (ClusterIP / LB)
client traffic"] + subgraph Replicas["orca Deployment"] + R1["Replica 1
:8443 edge
:8444 internal
:8442 ops"] + R2["Replica 2"] + R3["Replica N"] + end + Headless["Headless Service
peer discovery"] + Internal["Internal listener :8444
GET /internal/fill"] + Ops["Ops :8442
/healthz, /readyz
(kubelet only)"] + CS[("CacheStore
in-DC S3-compatible")] + end + subgraph Cloud["Cloud origins"] + S3[("AWS S3")] + Azure[("Azure Blob
Block Blobs only")] + end + Clients -- "S3 GET / HEAD / LIST
+ Range" --> Service + Service --> R1 + Service --> R2 + Service --> R3 + R1 -. "DNS refresh
default 5s" .-> Headless + R2 -.-> Headless + R3 -.-> Headless + R1 <--> Internal + R2 <--> Internal + R3 <--> Internal + R1 -.- Ops + R2 -.- Ops + R3 -.- Ops + R1 <--> CS + R2 <--> CS + R3 <--> CS + R1 -- "miss-fill
If-Match: etag" --> S3 + R2 -- "miss-fill
If-Match: etag" --> S3 + R3 -- "miss-fill
If-Match: etag" --> Azure +``` + +## 5. Chunk model + +A `ChunkKey` is six fields: `{origin_id, bucket, object_key, +etag, chunk_size, chunk_index}`. + +- `origin_id` is a deployment-scoped name from config (e.g. + `aws-us-east-1-prod`). Required. Two Orca deployments can share + the same cachestore bucket without colliding because their keys + start with different `origin_id` values. +- `etag` makes a key's content explicit. A new ETag means a new + logical object: it gets a fresh set of chunks. Old chunks from + the old ETag fall out of the cachestore via lifecycle policy + (see [s11](#11-eviction-and-capacity)). +- `chunk_size` is baked into the storage-path hash, so changing + it in config never corrupts existing data. +- `chunk_index = floor(byte / chunk_size)`. + +A small metadata cache holds `(origin_id, bucket, key) -> ObjectInfo` +with two TTLs: 5 minutes for hits, 60 seconds for misses. Without +it, every request would re-`HEAD` the origin. + +Each chunk's storage path is deterministic: + +`LE64(x)` is the little-endian 8-byte encoding of a 64-bit unsigned +integer, `||` is byte-string concatenation, and `LP(s)` is the +length-prefixed encoding of `s` (its length as `LE64` followed by +its bytes). Length-prefixing each field prevents two distinct +inputs from producing the same hash via boundary ambiguity (e.g. +`("ab", "c")` vs. `("a", "bc")`). + +``` +LP(s) = LE64(uint64(len(s))) || s +hashKey = sha256( + LP(origin_id) || + LP(bucket) || + LP(key) || + LP(etag) || + LE64(chunk_size) + ) +path = "//" +``` + +`origin_id` is in the path in the clear (it's not hashed) so an +operator can delete one deployment's chunks with a single +`aws s3 rm --recursive //`. `chunk_size` goes +into the hash, not the path, so changing it doesn't break +anything visible. + +**What happens if you change `chunk_size`.** Nothing bad. Each +chunk's path is hashed from the chunk size, so old chunks at the +old size never collide with new chunks at the new size. The old +chunks just become unreachable. Plan for two things while the +working set rebuilds at the new size: storage usage roughly +doubles, and origin traffic spikes briefly. The old chunks age +out on their own via the bucket's lifecycle policy. + +### 5.1 Effective chunk size + +Chunk size is not one global number. The edge handler picks it +per request from a base size plus an optional list of tiers. +Each tier says "for objects this big and larger, use this chunk +size." The base covers small objects; tiers kick in at higher +object sizes. + +Default ladder: + +| Object size | Chunk size | +|---|---| +| under 1 GiB | 8 MiB (base) | +| 1 GiB to 10 GiB | 64 MiB | +| over 10 GiB | 128 MiB | + +**Why a ladder.** Small objects don't need big chunks - that +would waste memory per fill. Big objects pay a high price for +small chunks - more HTTP requests, more per-chunk overhead. The +ladder picks a size that fits each object. + +**Why it's safe to change.** Each chunk's storage path includes +the chunk size in its hash. So a chunk written at 8 MiB and a +chunk written at 128 MiB live at different paths and never +overlap. If you change the ladder, old chunks at the old size +simply age out via the bucket lifecycle policy. Nothing gets +corrupted. + +**Why tiers can't overlap.** The config requires tiers to be +sorted by their object-size threshold, with no duplicates. The +loader rejects anything else. So for any object size there is +exactly one matching tier (or the base, if no tier matches). + +**Cross-replica safety.** The peer-to-peer fill RPC sends the +chunk size along with every request (see +[s7.3](#73-cluster-wide-deduplication-via-per-chunk-fill-rpc)). +If two replicas are running with different tier settings during +a rolling deploy, every request is still self-contained - the +receiver uses the size the sender asked for. No coordination is +needed. + +To find a chunk, Orca calls `CacheStore.Stat(key)`. The +`ChunkCatalog` (an in-memory LRU) remembers recent Stat hits so +the hot path skips the cachestore. The catalog is a cache for +the cache: drop it and Orca still works. It stores nothing per +entry beyond "this path is present", because the path already +encodes the chunk's exact identity. If the cachestore later +loses the chunk (e.g. lifecycle deletes it), the next `GetChunk` +returns `ErrNotFound`, the caller calls `Forget`, and the next +request re-stats. + +For a request `Range: bytes=A-B`: + +``` +firstChunk = A / chunk_size +lastChunk = B / chunk_size +for cid := firstChunk; cid <= lastChunk; cid++ { + fetchOrServe(cid) + sliceWithin(cid, max(A, cid*sz), min(B, (cid+1)*sz - 1)) +} +``` + +The loop is streaming: Orca never builds the full list of chunk +keys up front. + +### Diagram 2: Range request -> chunk index mapping + +`SizeFor` below is the tier-ladder lookup described in +[s5.1](#51-effective-chunk-size). + +```mermaid +flowchart LR + Req["GET /bucket/key
Range: bytes=A-B"] --> Math["chunk_size = SizeFor(info.Size)
firstChunk = A / chunk_size
lastChunk = B / chunk_size"] + Math --> Iter["streaming iterator
cid := firstChunk..lastChunk"] + Iter --> Keys["per cid: ChunkKey =
{origin_id, bucket, key,
etag, chunk_size, cid}"] + Keys --> Path["path =
origin_id /
hex(sha256(LP(origin_id) || ...)) /
cid"] + Path --> CS[("CacheStore
address")] +``` + +## 6. Request flow + +A `GET /{bucket}/{key}` arrives, maybe with a `Range` header. +The edge handler does this: + +1. **Get the object's metadata.** Call + `fetch.Coordinator.HeadObject`. It first checks the metadata + cache. On a miss, the per-replica HEAD singleflight runs + `metadata.LookupOrFetch` and calls `Origin.Head` once. An + empty `ETag` in the response is rejected as + `MissingETagError`. Hits live 5 minutes (`metadata.ttl`); + negative cases (`ErrNotFound`, `UnsupportedBlobTypeError`, + `MissingETagError`) live 60 seconds (`metadata.negative_ttl`). +2. **Handle empty objects.** If the object is zero bytes, return + 200 with an empty body right away. A `Range` header on a + zero-byte object is 416. +3. **Parse and check the range.** Validate any `Range` header + against `info.Size`. An unsatisfiable range is 416. +4. Compute the chunk range with `chunk.IndexRange`. +5. **Fetch the first chunk before sending any headers.** Call + `fc.GetChunk(firstKey, info.Size)`, wrap the reader in a + `bufio.Reader`, and `Peek(1)`. If the peek fails - origin + unreachable, auth, ETag changed, missing ETag - the handler + returns a clean S3-style error without ever sending a 200 / + 206. Once that first byte is in hand, the handler sends + headers (`Content-Length`, optional `Content-Range`, `ETag`, + `Content-Type`) and starts streaming. +6. **Stream chunk by chunk.** Stream the first chunk's slice, + then fetch and stream chunks 1..N. If a fetch fails after + headers are out, the response just ends mid-body; S3 SDKs + notice the Content-Length mismatch and retry. +7. **For each chunk**, `fc.GetChunk` first checks the catalog and + the cachestore. A hit returns a reader clamped to + `k.ExpectedLen(info.Size)`. A miss goes to the cluster-wide + dedup path + ([s7.3](#73-cluster-wide-deduplication-via-per-chunk-fill-rpc)). +8. **Cold-path fill.** The leader fetches the chunk from the + origin with pre-header retry, checks the body length against + `ExpectedLen`, buffers it in memory, releases the joiners, and + commits to the cachestore in the background (commit-after- + serve - see [s7.2](#72-singleflight--commit-after-serve)). + +### Diagram 3: Scenario A - warm read (cache hit) + +```mermaid +sequenceDiagram + autonumber + participant C as Client + participant R as Replica (assembler) + participant Cat as ChunkCatalog + participant CS as CacheStore + C->>R: GET /bucket/key Range: bytes=A-B + R->>R: HeadObject -> info (metadata cache) + R->>Cat: Lookup(firstChunk) + Cat-->>R: hit + R->>CS: GetChunk(firstChunk, 0, expectedLen) + CS-->>R: bytes (reader) + R->>R: Peek(1) // origin reachability proxy + R-->>C: 200/206 + headers + first slice + loop remaining chunks + R->>Cat: Lookup(k) + Cat-->>R: hit + R->>CS: GetChunk(k) + CS-->>R: bytes + R-->>C: stream slice + end +``` + +A cache hit. The assembler asks the catalog, reads from the +cachestore, and streams to the client. No origin call, no peer +call. + +### Diagram 4: Scenario B - cold miss, local coordinator + +```mermaid +sequenceDiagram + autonumber + participant C as Client + participant R as Replica (assembler == coordinator) + participant SF as Singleflight on R + participant O as Origin + participant CS as CacheStore + participant Cat as ChunkCatalog + C->>R: GET /bucket/key Range + R->>R: HeadObject -> info + R->>R: ChunkCatalog miss, then Stat miss + R->>SF: Acquire(k) [leader] + SF->>O: GetRange(..., If-Match: etag)
(pre-header retry) + O-->>SF: full chunk bytes + SF->>SF: validate buf.Len() == ExpectedLen(info.Size) + Note over SF: release joiners (close f.done) + SF-->>R: bytes (in-memory reader over f.bodyBuf) + R->>R: Peek(1), commit headers + R-->>C: 200/206 + headers + body + par commit-after-serve (async vs joiner reads) + SF->>CS: PutChunk(If-None-Match: *) + CS-->>SF: 200 (commit_won) or 412 (commit_lost) + end + alt commit_won + SF->>Cat: Record(k) + else commit_lost + SF->>CS: Stat(k), Record on success + end +``` + +A cold miss where the same replica is both the assembler and the +coordinator. The replica fetches from origin, hands the bytes to +the client, and writes to the cachestore in the background. + +### 6.1 HEAD request flow + +`HEAD /{bucket}/{key}` is served from object metadata. No chunks +are touched. + +1. The edge handler calls `fc.HeadObject`. A metadata-cache hit + returns the cached `ObjectInfo`. A miss runs the per-replica + HEAD singleflight, which issues one `Origin.Head`. +2. On success, return 200 with `Content-Length: info.Size`, + `ETag: info.ETag`, `Content-Type: info.ContentType`, and + `Accept-Ranges: bytes`. +3. Errors reuse the GET error mapping (s6.3). A 404 is cached + negatively. `UnsupportedBlobTypeError` comes back as a 502 + `OriginUnsupported`. `MissingETagError` comes back as a 502 + `OriginMissingETag`. All three are cached negatively. + +### 6.2 LIST request flow + +`GET /{bucket}/?list-type=2&prefix=...` is a thin pass-through to +`Origin.List`. The handler pulls `prefix`, `continuation-token`, +and `max-keys` from the query string, calls the origin, and +turns the result into a minimal `ListBucketResult` XML body. + +This is deliberately narrow. A per-replica LIST cache tuned for +FUSE `ls` workloads is in scope as future work; see +[Deferred / future work](#13-deferred--future-work). + +### 6.3 HTTP error-code mapping + +| Status | S3-style code | Reason | Triggered by | Client retry? | +|---|---|---|---|---| +| 200 / 206 | (none) | normal hit or successful fill | hit + range OK; cold-path fill after pre-header-retry commit | n/a | +| 404 | `NoSuchKey` | origin returned `ErrNotFound` (cached negatively) | edge HEAD / GET miss | no | +| 416 | (text body) | range vs. `info.Size` violation | range math at request entry; or any `Range` against a zero-byte object | no (different range) | +| 502 | `OriginUnsupported` | non-BlockBlob azureblob; from `UnsupportedBlobTypeError` (cached negatively) | `Origin.Head` returns an unsupported blob type | no | +| 502 | `OriginETagChanged` | `OriginETagChangedError` from `Origin.GetRange`; not retried | mid-flight overwrite caught by `If-Match` | yes (next request re-`Head`s) | +| 502 | `OriginMissingETag` | `MissingETagError` from the fetch coordinator (cached negatively) | origin `Head` returned an empty ETag | no (operator must fix the origin config) | +| 502 | `Unauthorized origin` | `origin.ErrAuth` | origin returned 401 / 403 | no (operator) | +| 502 | `OriginUnreachable` | uncategorised origin error (5xx, timeouts past retry budget, DNS) | leader retry budget exhausted; cachestore failure during read | yes (origin may recover) | +| 503 | (probe response) | replica `NotReady` | `/readyz` failing predicates | n/a (LB drain) | +| (mid-stream abort) | n/a | post-header failure | origin disconnect, peer 5xx, cachestore failure after `Peek(1)` succeeded | S3 SDKs detect the Content-Length mismatch and retry | + +Pre-header errors come back as `http.Error` text. The 416 paths +do too. There is no per-error S3-style XML envelope yet; S3 SDKs +accept the text body and route on the HTTP status. Mid-stream +aborts end the response (HTTP/2 `RST_STREAM` or HTTP/1.1 +`Connection: close`). + +### 6.4 Edge read-ahead + +The chunk-by-chunk loop in step 6 of the request flow is not +strictly one-at-a-time. While the edge is sending one chunk to +the client, it can pull the next few chunks from the cachestore +at the same time. The default is up to 8 in flight per client +request. + +**Why this matters.** A 700 GiB object at 128 MiB chunks is +around 5,600 chunks. Without read-ahead, each chunk is fetched, +then sent, then the next is fetched - one round trip after +another. With 8 in flight, most of the per-chunk round-trip time +is hidden behind sending bytes to the client. + +**How it works.** The edge starts a small producer that issues +chunk fetches in order. Each fetch runs in its own worker. +Results come back in chunk order via a small in-memory queue, so +the client always receives bytes in the right order even if a +later worker finishes first. + +**What stays the same.** The first chunk is still fetched and +checked before any response headers go out. If something fails +on chunk 0 - origin down, missing ETag, anything else - the +client gets a clean S3-style error, not a partial body. +Read-ahead only applies to chunks 1..N. Cold fills still go +through the per-replica origin cap +([s7.1](#71-per-chunkkey-singleflight)), so the cluster does not +suddenly issue more origin requests just because read-ahead is +on. Memory stays bounded by the origin cap. + +**What happens on failure.** If a chunk fetch fails after +headers are out, the response just ends - same as before. If +the client disconnects, the producer stops and closes any chunk +bodies it has already pulled, so nothing leaks. If a worker +panics, it is caught, logged, and reported back to the consumer +as a fetch error. + +**Turning it off.** Set `chunking.readahead: 0` to go back to +strict one-at-a-time fetching. + +## 7. Stampede protection + +The hot path. The job here is simple: when many clients ask for +the same chunk at the same time, the origin should see one +fetch, not many. Two mechanisms do this together. + +1. **Inside one replica:** if a fetch for a chunk is already + running, new requests for that chunk wait for the running + fetch instead of starting their own. This is the singleflight. +2. **Across replicas:** a hash on the chunk's identity picks + exactly one replica as the coordinator for that chunk. The + other replicas ask that one over a private channel. So even + across the cluster, only one replica fetches. + +The named seams these mechanisms run through: + +| Seam | File | Role | +|---|---|---| +| `origin.Origin` | `internal/orca/origin/origin.go` (interface); `internal/orca/origin/awss3/`, `internal/orca/origin/azureblob/` | Read-only adapter to the upstream blob store. `If-Match: ` on every `GetRange`. | +| `cachestore.CacheStore` | `internal/orca/cachestore/cachestore.go` (interface); `internal/orca/cachestore/s3/` | In-DC chunk store; source of truth for chunk presence. `PutChunk` is atomic + no-clobber (returns `ErrCommitLost` on conflict). | +| `chunkcatalog.Catalog` | `internal/orca/chunkcatalog/chunkcatalog.go` | Bounded in-memory LRU recording chunks known to be in the cachestore. Presence-only. | +| `cluster.Cluster` | `internal/orca/cluster/cluster.go` | Peer discovery (DNS), rendezvous hashing, internal-fill RPC client + response validator. | +| `fetch.Coordinator` | `internal/orca/fetch/fetch.go` | Per-replica fill orchestrator. Owns the singleflight, the origin semaphore, and the pre-header retry loop. | + +### 7.1 Per-`ChunkKey` singleflight + +The fetch coordinator keeps a map of in-flight fills, keyed on +the chunk's storage path. The map is guarded by a mutex. Each +entry holds a `done` channel, an error slot, and the buffer the +leader will fill. + +Two cases on entry: + +- The map has no entry for this chunk. The caller becomes the + leader, inserts a fresh entry, and runs `runFill` in a + goroutine. +- The map already has an entry. The caller is a joiner. It waits + on the leader's `done` channel. + +Joiners select between their own request context and `<-f.done`. +On release they either return the leader's error or wrap the +leader's buffer in a `bytes.Reader` and stream it. The leader +guarantees the buffer is fully written and length-checked before +it closes `done`, so joiners never see a half-written buffer. + +When `runFill` returns, the leader removes the in-flight entry. +Any request arriving after that point misses the map. By then +the chunk should be in the catalog and the request takes the +hit path. + +### 7.2 Singleflight + commit-after-serve + +What the leader does in `runFill`: + +1. Runs on its own 5-minute context, not the client's. The + cachestore commit then finishes even if every caller has + walked away. The 5-minute ceiling caps how long a zombie fill + can hold resources. +2. Takes a slot from the per-replica origin semaphore. The + semaphore is sized `floor(target_global / target_replicas)`. + Waiting more than `origin.queue_timeout` (default 5s) returns + an error to the caller. +3. Calls `Origin.GetRange` through `fetchWithRetry`. The retry + loop is 3 attempts within 5 seconds, with exponential backoff + capped at 2 seconds. `OriginETagChangedError` and + `origin.ErrNotFound` are not retried. +4. Copies the body into a fresh `bytes.Buffer`. +5. **Checks the length** against `k.ExpectedLen(objectSize)`. A + short body is a hard error. If Orca recorded a short chunk, + later requests would silently get truncated data. So the + leader refuses to commit, hands the error to the joiners, and + lets the next request try again. +6. Stores the buffer on the fill entry and **releases joiners** + (closes `f.done`, wrapped in a `sync.Once` so it fires + exactly once) **before** writing to the cachestore. +7. Writes to the cachestore via `PutObject` with + `If-None-Match: *`. +8. On success, records the chunk in the catalog. +9. On `ErrCommitLost` (the 412 from the cachestore), another + replica won the race. Stat the existing entry and record it + in the catalog on success. +10. On any other error, log it and move on. The chunk is not + recorded; the next request refills (one extra origin GET in + the worst case). The client never sees this error because the + response already went out. + +Releasing joiners before the commit matters for cold-path +time-to-first-byte. Joiners get their bytes as soon as the +origin delivered them. Without the reorder, joiners would wait +for both the origin round-trip and the cachestore commit +round-trip before seeing any data. + +The buffer-write, validate, release-joiners, then commit +sequence is safe because `bytes.Buffer`'s underlying slice +doesn't change after the final `io.Copy`. So joiners' reads of +`buf.Bytes()` and the cachestore `PutChunk`'s read of the same +slice are independent reads of an unchanging region. + +There is no on-disk spool and no tee. The full chunk lives in +memory until the commit returns. Peak memory per fill is one +chunk (8 MiB by default). With the per-replica origin cap at 64, +the worst-case buffer footprint per replica is around 512 MiB +under full saturation. + +### 7.3 Cluster-wide deduplication via per-chunk fill RPC + +A hash on the chunk's identity picks one coordinator from the +current peer set. The replica that took the client request is +the assembler. For each chunk in the requested range: + +- **Hit** (the catalog or `Stat` says the chunk is there): the + assembler reads from the cachestore directly. No internal RPC. +- **Miss, this replica is the coordinator:** run the local + singleflight ([s7.1](#71-per-chunkkey-singleflight)) and commit + ([s7.2](#72-singleflight--commit-after-serve)). +- **Miss, some other replica is the coordinator:** the assembler + calls `GET /internal/fill?` on that replica's + internal listener ([s7.4](#74-internal-rpc-listener)). The + coordinator runs the singleflight + commit path locally and + streams the bytes back. The assembler stitches the bytes into + the client response, slicing the first and last chunks to + match the client's `Range`. + +**Loop prevention.** The assembler sets `X-Orca-Internal: 1` on +internal RPCs. The internal handler checks +`Cluster.IsCoordinator(k)`. If the receiving replica disagrees +(peer membership has shifted), it returns 409 with +`{"reason":"not_coordinator"}`. `FillFromPeer` recognizes this +as `cluster.ErrPeerNotCoordinator` and the caller falls back to +filling locally. The loser of the resulting commit race gets +`ErrCommitLost`. Internal RPCs are never forwarded. + +**Wire format.** +`GET /internal/fill?origin_id=...&bucket=...&key=...&etag=...&chunk_size=N&index=N&object_size=N`. +`DecodeChunkKey` requires `chunk_size > 0`, `index >= 0`, +`object_size > 0`, and a non-empty `origin_id` and `key`. +Anything else is a 400. + +**Response framing.** The coordinator sets `Content-Length` to +`ExpectedLen(objectSize)` and `Content-Type` to +`application/octet-stream`. The caller wraps the response body +in a `validatingReader` that checks the actual byte count +against the advertised length. If they disagree it returns +`io.ErrUnexpectedEOF`. This catches truncated cross-replica +responses. + +### Diagram 5: Scenario D - cold miss, remote coordinator + +```mermaid +sequenceDiagram + autonumber + participant C as Client + participant A as Replica A (assembler) + participant B as Replica B (coordinator for k) + participant SF as Singleflight on B + participant O as Origin + participant CS as CacheStore + C->>A: GET /bucket/key Range + A->>A: rendezvous(k, peers) -> B + A->>B: GET /internal/fill?...&object_size=N
X-Orca-Internal: 1 + B->>B: IsCoordinator(k)? yes + B->>SF: Acquire(k) [leader] + SF->>O: GetRange(..., If-Match: etag)
(pre-header retry) + O-->>SF: full bytes + SF->>SF: validate buf.Len() == ExpectedLen + SF-->>B: bytes (in-memory) + B-->>A: 200 + Content-Length + stream
(validatingReader on A's side) + A-->>C: stream sliced bytes + par async commit-after-serve on B + SF->>CS: PutChunk(If-None-Match: *) + CS-->>SF: commit_won or commit_lost + end + Note over A,B: 409 from B -> A falls back to local fill +``` + +A cold miss where the coordinator is a different replica. The +assembler hands the work off, streams the bytes through, and the +coordinator commits in the background. A 409 from the +coordinator means peer membership has shifted; the assembler +falls back to filling locally. + +### 7.4 Internal RPC listener + +The per-chunk fill RPC runs on its own port (default `:8444`, +config `cluster.internal_listen`). That keeps cross-replica +traffic off the client edge. + +In dev the listener is plain HTTP/2. Config keys exist for mTLS +(`cluster.internal_tls.{enabled, cert_file, key_file, ca_file, server_name}`) +but nothing enforces them yet. Production deployments rely on +Kubernetes NetworkPolicy or equivalent to isolate the port, not +on TLS at the listener. + +Loop prevention: the listener requires `X-Orca-Internal: 1` and +checks `Cluster.IsCoordinator(k)`. Disagreement returns 409. + +The listener serves only `GET /internal/fill`. Health and +readiness probes are on the ops listener; the client S3 API is +on the edge listener. + +### 7.5 Metadata-layer singleflight + +Same pattern, at the metadata cache. +`metadata.LookupOrFetch` maps each `(origin_id, bucket, key)` +to a singleflight entry. So a flood of distinct cold keys +generates at most one `Origin.Head` per object per replica per +`metadata.ttl` window. Across the cluster that's up to N HEADs +per object per window, where N is the peer count. A +cluster-wide HEAD coordinator is future work. + +The entry is removed from the map **before** its `done` channel +is closed, so a caller arriving in that brief window starts a +fresh fetch instead of getting the old entry's cached error. +The trade-off: under contention you might pay one extra HEAD +per miss. In exchange a transient HEAD error never gets +replayed to a later caller. + +### 7.6 Cancellation safety + +`runFill` runs on its own 5-minute context, so it finishes +even when every caller has disconnected. The origin slot is +released when `runFill` returns. A joiner that cancels only +cancels itself (it `select`s between its context and +`f.done`). + +If the leader's 5-minute context fires, the fill fails for the +joiners too. Worst case Orca wasted one fill's worth of work, +and the next request triggers a fresh one. + +### 7.7 Failure handling without re-stampede + +How each kind of failure is handled: + +- **Retryable origin errors during pre-header retry.** The + leader retries up to `origin.retry.attempts` (default 3) + within `origin.retry.max_total_duration` (default 5s), with + exponential backoff (`origin.retry.backoff_initial=100ms`, + `origin.retry.backoff_max=2s`). All this happens before any + HTTP header is sent, so the client never sees the transient + failure. If the budget runs out, the client gets a 502 + `OriginUnreachable`. +- **`OriginETagChangedError`.** Not retried. The leader + invalidates the metadata cache entry for + `(origin_id, bucket, key)` and returns the error. The next + request re-`Head`s, sees the new ETag, builds a new + `ChunkKey`, and refills under the new path. +- **`origin.ErrNotFound`.** Not retried. Cached negatively for + `metadata.negative_ttl`. The client gets a 404. +- **`UnsupportedBlobTypeError` / `MissingETagError`.** Not + retried. Cached negatively. The client gets a 502. +- **Short body from the origin.** Hard error. `runFill` rejects + a body that doesn't match `ExpectedLen(objectSize)`. The fill + fails, the joiners see the error, and the catalog is not + updated. This is what stops a short fetch from poisoning the + catalog. +- **Commit failure after the response is gone** + (`PutChunk` returns something other than `nil` or + `ErrCommitLost`). The client already has the bytes, so the + failure is invisible to them. The chunk is not recorded; the + next request will refill. A sustained rate of this is a + cachestore-health problem; today it's only visible in the + structured debug logs. +- **CacheStore `ErrTransient` / `ErrAuth` during a read.** The + client gets a 502. Orca does not auto-refill, because that + would just hammer a backend that's already struggling. + +## 8. Atomic commit + +The leader publishes a chunk to the cachestore in one step that +won't overwrite anything: `PutObject` with `If-None-Match: *`. +The second concurrent commit for the same key gets HTTP 412 and +is recorded as `ErrCommitLost`. So when two replicas race to +fill the same chunk, exactly one wins; the loser treats the +existing object as the truth. + +Joiners don't wait for the commit +([s7.2](#72-singleflight--commit-after-serve)). They're released +as soon as the leader's buffer is full and length-checked. The +`PutChunk` RPC runs in parallel with the joiners' reads. If the +commit fails, the client never knows; Orca just doesn't record +the chunk, and the next request refills. + +**Boot-time self-test (`SelfTestAtomicCommit`).** At startup the +`cachestore/s3` driver writes a probe key, then writes the same +probe key again with `If-None-Match: "*"` and expects a 412. If +the second write returns 200 (the backend silently overwrote), +the driver refuses to start. This catches backends that don't +implement the precondition. Verified backends today: AWS S3 +(since 2024-08), MinIO, VAST Cluster (only on non-versioned +buckets). + +**Boot-time versioning gate.** The driver also runs +`GetBucketVersioning(bucket)`. If versioning is `Enabled` or +`Suspended`, startup fails with a clear error. VAST and several +S3-compatible backends ignore `If-None-Match: *` on versioned +buckets, which would silently break the atomic-commit rule. + +## 9. Bounded staleness contract + +Orca relies on a promise from the operator. It also caps the +damage if the operator breaks the promise. + +### 9.1 The contract and the staleness window + +**The contract.** For any `(origin_id, bucket, object_key)`, the +bytes never change once published. To change the data, publish +a new key. Overwriting in place is breaking the promise. + +**Why this is enough.** The chunk's storage path includes its +ETag (s5). New ETag, new path. So as long as operators publish +new bytes under new keys, Orca cannot serve old bytes for a new +key. + +**What happens if the promise is broken.** For up to 5 minutes +(the default `metadata.ttl`), Orca may serve the old bytes. +Here's why: + +- Object metadata (`size`, `etag`, `content_type`) is cached for + `metadata.ttl` so Orca doesn't re-`HEAD` on every request. +- During that window, every request looks up the cached ETag, + builds the old `ChunkKey`, and serves from the old chunks. +- When the window expires, the next request does a fresh `Head`, + sees the new ETag, builds a new `ChunkKey`, and refills. + +**Why this is OK for the target workload.** Orca is built for +large immutable artifacts (job inputs, model weights, training +shards). Those naturally fit the contract. The 5-minute window +is the worst case, not the normal case. A new key gets the right +ETag right away. + +**Safety net.** Every `Origin.GetRange` sends `If-Match: `. +If an in-flight fetch races with an in-place overwrite, the +origin returns 412 `PreconditionFailed`. The leader fails the +fill and invalidates the metadata cache entry. This catches the +narrow case where a violation happens between the `Head` and the +`GetRange`. It does **not** catch a violation between two +separate request lifecycles inside the same `metadata.ttl` +window. The `metadata.ttl` cap is what bounds that case. + +## 10. Create-after-404 and negative-cache lifecycle + +### 10.1 The scenario + +The "I forgot to upload that" case. A client asks for key `K`. +The origin doesn't have it yet. Orca caches the 404 and returns +it. Then the operator uploads `K`. Orca keeps returning 404 +until the cached 404 expires. + +From the client's view, this looks the same as the operator +breaking the no-overwrite rule (s9): the bytes for `K` changed +without Orca knowing. There is no origin-to-cache invalidation, +so all Orca can do is cap how long it serves the stale 404. + +### 10.2 Asymmetric TTLs + +The metadata cache uses two TTLs: + +| TTL | Default | Bounds | Why | +|---|---|---|---| +| `metadata.ttl` | 5m | how long Orca trusts a `200 + ETag` without re-`HEAD`ing | the contract holds in normal use, so trusting it longer cuts origin HEAD load | +| `metadata.negative_ttl` | 60s | how long Orca trusts a `404`, `UnsupportedBlobTypeError`, or `MissingETagError` | operators do upload keys that someone already tried to fetch, so recovery should be quick | + +The two timeouts are different on purpose. The 5-minute timeout +only matters if the operator breaks the no-overwrite rule. The +60-second timeout matters every time someone uploads a key that +a client already saw a 404 on - a normal thing that happens. + +The per-replica HEAD singleflight (s7.5) keeps the short +negative TTL from creating HEAD storms. A flood of distinct +missing keys produces at most one HEAD per object per replica +per `metadata.negative_ttl`. At defaults (60s, 3 replicas) the +origin sees at most 3 HEADs per missing key per minute, well +under any documented S3 / Azure rate limit. + +### 10.3 Worst-case unavailability window + +After an operator uploads a key that someone already tried to +fetch: + +- A replica that saw the original 404 keeps serving 404 for up + to `metadata.negative_ttl` from when **it** saw the 404, not + from when the upload happened. Orca has no way to know when + the upload happened. +- A replica that did not see the 404 will `Head` fresh on the + first request and serve 200 right away. +- Worst case across the cluster: `metadata.negative_ttl` after + the last replica's original 404. Under round-robin load + balancing, clients can see 404 and 200 alternating during the + drain. + +There is no way to actively invalidate (no origin push, no +admin RPC). The workaround: after an upload, wait +`metadata.negative_ttl` before telling anyone the key exists. + +### Diagram 6: Scenario G - create-after-404 timeline + +```mermaid +sequenceDiagram + autonumber + participant Op as Operator + participant C as Client + participant A as Replica A + participant B as Replica B + participant O as Origin + Note over A,B: t=0 K not yet uploaded + C->>A: GET /bucket/K + A->>O: Head(K) + O-->>A: 404 + Note over A: cache K -> 404
TTL = metadata.negative_ttl (60s) + A-->>C: 404 + Note over Op,O: t=30s operator uploads K + Op->>O: PUT /bucket/K + Note over A,B: t=45s drain window + C->>B: GET /bucket/K (LB routes to B) + B->>O: Head(K) + O-->>B: 200 + ETag + B->>O: GetRange (fill path) + O-->>B: bytes + B-->>C: 200 + bytes + Note over A,B: inconsistent results across replicas during drain + C->>A: GET /bucket/K (LB routes to A again) + Note over A: negative entry still valid
age 45s less than 60s + A-->>C: 404 STALE + Note over A: t=60s+ negative entry expires + C->>A: GET /bucket/K (t=70s) + A->>O: Head(K) + O-->>A: 200 + ETag + A->>O: GetRange (fill path) + O-->>A: bytes + A-->>C: 200 + bytes + Note over A,B: drain complete - replicas consistent +``` + +A timeline of the drain. Replica A saw the 404; replica B did +not. During the window between the upload and the cache expiry, +clients can get a 200 from B and a 404 from A on the same key. + +## 11. Eviction and capacity + +### 11.1 Passive eviction (lifecycle) + +Eviction is the cachestore's job, not Orca's. The recommended +setup is age-based expiration on the chunk prefix, with the +expiry chosen to fit the working set in the available capacity. +Storage paths start with `origin_id`, so an operator can set a +different lifecycle for each deployment that shares a bucket. + +For AWS S3, MinIO, and VAST, the bucket lifecycle policy handles +this. Configure it on the bucket. + +The `cachestore.CacheStore` interface has a `Delete(k)` method, +but production code doesn't call it. The method is there so a +future active-eviction loop can use it; see +[Deferred / future work](#13-deferred--future-work). + +### 11.2 ChunkCatalog size + +The catalog is capped by `chunk_catalog.max_entries` (default +100,000). Each entry is roughly 80 bytes (the path string plus a +list pointer), so the default is about 8 MB per replica. +Operators with very large active working sets should size the +catalog to a multiple of the expected chunk count (working set / +chunk size). + +A catalog smaller than the working set is still correct, just +slower: cold lookups fall through to `CacheStore.Stat`. The +cachestore is always the truth. + +### 11.3 `chunk_size` config-change capacity impact + +Changing `chunk_size` orphans the old chunks (s5). Storage +roughly doubles for a while as the working set rebuilds at the +new size. The bucket lifecycle policy ages the orphaned chunks +out. + +### 11.4 Per-fill memory + +Peak memory per fill is one chunk, at whatever size the tier +ladder picked for that object. With the default ladder, that's +8 MiB for small objects, up to 128 MiB for objects over 10 GiB. + +The per-replica origin cap is +`floor(target_global / target_replicas)`. On a 4-replica cluster +with `target_global = 64`, that's 16 concurrent fills. + +So the worst case per replica is `16 fills * 128 MiB = 2 GiB` of +in-flight chunk buffers when many large objects are being filled +at the same time. + +Operators with tighter memory budgets should remove the top tier +or lower its chunk size. Read-ahead does not change this number +- the cap on cold fills is what bounds memory. + +## 12. Horizontal scale + +Cluster membership comes from the headless Service. A DNS +A-record lookup returns the IPs of all Ready pods. The cluster +package polls that list every `cluster.membership_refresh` +(default 5s), and the hash on chunk identity picks a coordinator +per chunk. The assembler reads from the cachestore on a hit, +runs the local singleflight if it's the coordinator, or calls +`GET /internal/fill?` otherwise. + +Pod names are not stable under a Deployment. Orca addresses +peers only by IP, not by name. + +The cachestore stores one copy of each chunk. If a chunk is lost, +Orca refills from the origin. Every replica can read every +chunk; no replica owns any bytes, so losing a replica never +strands data. + +**What happens if the peer set is empty.** If `Cluster.Peers()` +comes back empty - the Service has no Ready endpoints, DNS +returns NXDOMAIN, or CoreDNS is broken - the replica treats +itself as the only peer. The hash picks self for every chunk and +every fill runs locally. Orca keeps serving; the only loss is +that cluster-wide dedup falls back to per-replica dedup until +DNS recovers. No process restart is needed. + +**What happens when a refresh fails.** On a DNS error or peer- +source error, the cluster keeps the previous (non-empty) peer +list rather than wiping it to `[Self]`. After 5 failures in a +row (`maxStalePeerRefreshes`) it falls back to `[Self]`. That +bounds how long Orca routes to dead peers. A `context.Canceled` +during graceful shutdown doesn't count toward the streak. + +**`/readyz` predicate.** `/readyz` only flips to 200 after at +least one successful peer-set snapshot. So if DNS is broken end +to end the replica stays `NotReady` and gets drained, even +though the empty-peer fallback would otherwise let it serve. + +**Rolling restarts.** Pod IPs change during a rolling restart, +and the new IPs take up to `cluster.membership_refresh` to +propagate. During that window the assembler and the new replica +can disagree on who owns a chunk. The assembler routes to a +stale IP and either gets `connection refused` (and falls back to +filling locally) or reaches the wrong replica (which returns 409 +`not_coordinator`, and the assembler falls back). Either way, +the loser of the resulting commit race gets `ErrCommitLost`. No +duplicate bytes are written. + +### Diagram 7: Membership flux during rolling restart + +```mermaid +sequenceDiagram + autonumber + participant A as Replica A + participant DNS as headless Service DNS + participant B as Replica B (old IP) + participant Bp as Replica B' (new IP) + participant CS as CacheStore + Note over A,B: t=0 peers (A's view) = {A, B}
chunk k owned by B + A->>DNS: refresh + DNS-->>A: [ip(A), ip(B)] + Note over B,Bp: t=5s rolling restart: B terminates,
B' starts with a new IP + Note over A: A's cached membership still {A, B}
until next refresh + A->>A: rendezvous(k, {A,B}) = B (stale) + A->>B: /internal/fill (connection refused) + A->>A: fallback: fill locally + A->>CS: PutChunk(If-None-Match: *) + Note over Bp: B' bootstraps, refreshes DNS
peers (B's view) = {A, B'} + Bp->>Bp: rendezvous(k, {A,B'}) = B' + Bp->>CS: PutChunk(If-None-Match: *) + CS-->>A: 200 commit_won + CS-->>Bp: 412 commit_lost (ErrCommitLost) + Note over A,Bp: at-most-one duplicate fill per chunk + Note over A,DNS: t=10s A refreshes DNS
peers converge to {A, B'}
steady state restored +``` + +A walks through B being replaced by B'. A still thinks B owns +chunk k, tries B's old IP, fails, and fills locally. Meanwhile +B' boots, decides it owns k, and fills too. Both write to the +cachestore. The atomic-commit rule means only one write sticks; +the other gets `ErrCommitLost`. No corruption. + +## 13. Deferred / future work + +Things considered and not built. None requires breaking +existing interfaces. Build each when there's measured evidence +that justifies the extra surface area. + +### Auth enforcement on edge and internal listeners + +The edge handler checks `cfg.Server.Auth.Enabled` and returns +401 if it's true, but nothing actually checks bearer tokens or +mTLS client certs. The internal listener takes plain HTTP/2 in +dev; the `cluster.internal_tls.*` config keys are read but +nothing does the TLS handshake. Production deployments rely on +Kubernetes NetworkPolicy (or equivalent network isolation) +today. + +Building this means: a real bearer-token check (HMAC against a +Kubernetes Secret), mTLS plumbing on both listeners with +separate trust roots, and a peer-IP check on the internal +listener. + +### Posix-shared cachestore drivers + +`cachestore/posixfs` (shared POSIX filesystems: NFSv4.1+, Weka +native, CephFS, Lustre, GPFS) and `cachestore/localfs` (dev) +were designed and not built. The atomic-commit primitive there +is `link()` returning `EEXIST` (or +`renameat2(RENAME_NOREPLACE)`). The posixfs flavor adds backend +detection, an NFS minimum-version check, refusal on Alluxio +FUSE, and a 2-character hex path fan-out. Both would share +helpers via `internal/orca/cachestore/internal/posixcommon/`. + +These would let Orca run against shared-filesystem deployments +that don't have an in-DC S3-compatible object store. The +`SelfTestAtomicCommit` hook on `CacheStore` is already shaped to +absorb them. + +### Prometheus metrics + +There are no Prometheus collectors yet. The diagnostic surface +today is structured `slog` output (debug-level traces through +every chunk-resolution decision, switchable via +`logging.level` or `ORCA_LOG_LEVEL`). + +The metric families that would matter: +- `orca_origin_*` (HEAD / GetRange counts, retry outcomes, + duplicate fills, ETag-changed). +- `orca_cachestore_*` (put / get / stat counts, commit + outcomes). +- `orca_commit_after_serve_total{ok|failed}`. +- `orca_origin_inflight` (per-replica origin semaphore gauge). +- `orca_fills_inflight` (per-replica singleflight map size). +- `orca_cluster_*` (peer-set size, refresh outcomes, internal- + fill duration, direction, 409 rate). +- `orca_metadata_*` (positive / negative counts and ages). +- `orca_chunk_catalog_hit_rate`. + +A Grafana dashboard is part of the work. + +### CacheStore circuit breaker + +A per-process circuit breaker around cachestore calls. Sustained +`ErrTransient` or `ErrAuth` would short-circuit writes so Orca +doesn't keep hammering a backend that's already in trouble. +Defaults considered: 10 errors per 30s window, 30s open, 3 +half-open probes. It would also flip `/readyz` to `NotReady` on +sustained `ErrAuth`, and gate any future active-eviction loop's +`Delete` calls. + +### LIST cache and cluster-wide LIST coordinator + +The LIST handler is a pass-through today. A per-replica LIST +cache keyed on +`(origin_id, bucket, prefix, continuation_token, start_after, delimiter, max_keys)` +would absorb FUSE `ls` workloads (`list_cache.ttl=60s` default, +`list_cache.max_entries=1024`). A cluster-wide LIST coordinator +on the same query tuple is the next step. Both need +`409`-fallback semantics like the chunk-fill coordinator. + +### Active eviction loop + +An opt-in background loop +(`chunk_catalog.active_eviction.enabled`) that uses +access-frequency tracking on the catalog to `CacheStore.Delete` +cold chunks. Requires extending the catalog to record +`AccessCount`, `LastAccessed`, and `LastEntered` per entry. The +`Delete` method on `CacheStore` exists for this. Useful for +posixfs deployments that don't have external sweep tooling. + +### Bounded-freshness mode + +An opt-in (`metadata_refresh.enabled`) per-replica background +loop that re-`Head`s hot keys before `metadata.ttl` expires. +That shrinks the effective staleness window for popular keys +from `metadata.ttl` to `refresh_ahead_ratio * metadata.ttl` +(e.g. 3.5 minutes). Hot-key detection uses access counters on +the metadata cache. + +### Cluster-wide HEAD singleflight + +A second coordinator role (`Cluster.HeadCoordinator(ObjectKey)`) +alongside the chunk-fill coordinator. With it, the cluster does +exactly one `Origin.Head` per object per `metadata.ttl` window +instead of N. Only justified at much larger peer-set sizes than +the documented 3-5 replicas. + +### Coordinated cluster-wide origin limiter + +A Kubernetes-Lease-elected authority that hands out slot-lease +tokens to peers, replacing the per-replica static cap with a +true cluster-wide cap on `Origin.GetRange` calls. Lots of moving +parts (election, slot-lease tokens, batching, fallback mode, +RBAC). Only worth it when the peer set grows past 10-ish and +individual replicas show sustained slot under-utilization. + +### Dynamic per-replica origin cap + +Compute `target_per_replica` at runtime from +`len(Cluster.Peers())` instead of from the static +`cluster.target_replicas` config knob. Helpful for HPA-driven +autoscaling, or when operators routinely change replica count +and forget to update the config. + +### Mid-stream origin resume + +Today, if the origin disconnects after Orca has sent any bytes +to the client, the response just ends; S3 SDKs retry from +scratch. A resume path would re-issue `Origin.GetRange` with +`Range: bytes=-` and keep feeding the client invisibly. +Trade-off: real state-tracking work, plus interaction with the +singleflight joiners. SDK retry already handles this case. + +### Per-request correlation IDs + +Threading a request-scoped logger through every fetch coordinator +method needs ctx propagation in a lot of places. The shared +`slog.Group("chunk", ...)` taxonomy plus `AddSource: true` +already give cross-package correlation by chunk identity. + +### Orphan-chunk garbage collection + +When an origin ETag rotates, the old chunks under +`//...` stay in the cachestore until the +bucket lifecycle policy deletes them. The atomic-commit rule +means there's no corruption; the only cost is storage growth in +proportion to the rotation rate. A real GC would walk the +cachestore and remove chunks whose +`(origin_id, bucket, key, etag)` no longer matches the current +origin `Head`. That's a lot of code for a problem that +lifecycle policies already handle in production. + +### Singleflight context propagation + +If the leader's request context cancels, the joiners get the +leader's error rather than continuing to wait on the fill (which +is on its own 5-minute context anyway). Self-healing on the +next request. Fixing this means restructuring the singleflight +join to outlive the leader's caller; a lot of work for a small +TTFB win. + +### Origin-semaphore starvation under cancellation storms + +A flood of cancelled requests can briefly hold origin slots +between acquire and the deferred release. Operational concern +only; no observed incident. Need metrics first. diff --git a/go.mod b/go.mod index 9fdc87a3..49794bf4 100644 --- a/go.mod +++ b/go.mod @@ -26,6 +26,11 @@ require ( github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.6.4 github.com/Masterminds/semver/v3 v3.4.0 github.com/Masterminds/sprig/v3 v3.3.0 + github.com/aws/aws-sdk-go-v2 v1.41.7 + github.com/aws/aws-sdk-go-v2/config v1.32.17 + github.com/aws/aws-sdk-go-v2/credentials v1.19.16 + github.com/aws/aws-sdk-go-v2/service/s3 v1.101.0 + github.com/aws/smithy-go v1.25.1 github.com/bougou/go-ipmi v0.8.3 github.com/cilium/ebpf v0.21.0 github.com/coder/websocket v1.8.14 @@ -49,6 +54,7 @@ require ( github.com/spf13/cobra v1.10.2 github.com/spf13/pflag v1.0.10 github.com/stretchr/testify v1.11.1 + github.com/testcontainers/testcontainers-go v0.42.0 github.com/vishvananda/netlink v1.3.1 golang.org/x/crypto v0.50.0 golang.org/x/mod v0.35.0 @@ -73,27 +79,51 @@ require ( ) require ( - dario.cat/mergo v1.0.1 // indirect - github.com/AdaLogics/go-fuzz-headers v0.0.0-20230106234847-43070de90fa1 // indirect + dario.cat/mergo v1.0.2 // indirect + github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 // indirect github.com/Azure/azure-sdk-for-go/sdk/internal v1.12.0 // indirect - github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect + github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 // indirect github.com/Masterminds/goutils v1.1.1 // indirect + github.com/Microsoft/go-winio v0.6.2 // indirect github.com/apex/log v1.9.0 // indirect + github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.10 // indirect + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.23 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.23 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.23 // indirect + github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.24 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.9 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.15 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.23 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.23 // indirect + github.com/aws/aws-sdk-go-v2/service/signin v1.0.11 // indirect + github.com/aws/aws-sdk-go-v2/service/sso v1.30.17 // indirect + github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.21 // indirect + github.com/aws/aws-sdk-go-v2/service/sts v1.42.1 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect + github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/containerd/errdefs v1.0.0 // indirect + github.com/containerd/errdefs/pkg v0.3.0 // indirect github.com/containerd/log v0.1.0 // indirect + github.com/cpuguy83/dockercfg v0.3.2 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect github.com/cyphar/filepath-securejoin v0.5.0 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/distribution/reference v0.6.0 // indirect + github.com/docker/go-connections v0.6.0 // indirect github.com/docker/go-units v0.5.0 // indirect github.com/dustin/go-humanize v1.0.1 // indirect + github.com/ebitengine/purego v0.10.0 // indirect github.com/emicklei/go-restful/v3 v3.12.2 // indirect github.com/evanphx/json-patch/v5 v5.9.11 // indirect + github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/go-errors/errors v1.4.2 // indirect + github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.3.0 // indirect + github.com/go-ole/go-ole v1.2.6 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect github.com/go-openapi/jsonreference v0.20.2 // indirect github.com/go-openapi/swag v0.23.0 // indirect @@ -110,12 +140,14 @@ require ( github.com/josharian/intern v1.0.0 // indirect github.com/josharian/native v1.1.0 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/klauspost/compress v1.18.0 // indirect + github.com/klauspost/compress v1.18.5 // indirect github.com/klauspost/pgzip v1.2.6 // indirect github.com/kr/pretty v0.3.1 // indirect github.com/kr/text v0.2.0 // indirect github.com/kylelemons/godebug v1.1.0 // indirect github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect + github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect + github.com/magiconair/properties v1.8.10 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/mattn/go-colorable v0.1.14 // indirect github.com/mattn/go-isatty v0.0.20 // indirect @@ -125,10 +157,16 @@ require ( github.com/mdlayher/socket v0.5.1 // indirect github.com/mitchellh/copystructure v1.2.0 // indirect github.com/mitchellh/reflectwalk v1.0.2 // indirect + github.com/moby/docker-image-spec v1.3.1 // indirect + github.com/moby/go-archive v0.2.0 // indirect + github.com/moby/moby/api v1.54.1 // indirect + github.com/moby/moby/client v0.4.0 // indirect + github.com/moby/patternmatcher v0.6.1 // indirect github.com/moby/spdystream v0.5.1 // indirect + github.com/moby/sys/sequential v0.6.0 // indirect github.com/moby/sys/user v0.4.0 // indirect github.com/moby/sys/userns v0.1.0 // indirect - github.com/moby/term v0.5.0 // indirect + github.com/moby/term v0.5.2 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect @@ -145,6 +183,7 @@ require ( github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.66.1 // indirect github.com/prometheus/procfs v0.16.1 // indirect @@ -153,10 +192,13 @@ require ( github.com/rogpeppe/go-internal v1.14.1 // indirect github.com/rootless-containers/proto/go-proto v0.0.0-20230421021042-4cd87ebadd67 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect + github.com/shirou/gopsutil/v4 v4.26.3 // indirect github.com/shopspring/decimal v1.4.0 // indirect - github.com/sirupsen/logrus v1.9.3 // indirect + github.com/sirupsen/logrus v1.9.4 // indirect github.com/sony/gobreaker/v2 v2.4.0 // indirect github.com/spf13/cast v1.7.0 // indirect + github.com/tklauser/go-sysconf v0.3.16 // indirect + github.com/tklauser/numcpus v0.11.0 // indirect github.com/u-root/uio v0.0.0-20230220225925-ffce2a382923 // indirect github.com/urfave/cli v1.22.12 // indirect github.com/vbatts/go-mtree v0.6.1-0.20250911112631-8307d76bc1b9 // indirect @@ -164,6 +206,12 @@ require ( github.com/x448/float16 v0.8.4 // indirect github.com/xlab/treeprint v1.2.0 // indirect github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect + github.com/yusufpapurcu/wmi v1.2.4 // indirect + go.opentelemetry.io/auto/sdk v1.2.1 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect + go.opentelemetry.io/otel v1.41.0 // indirect + go.opentelemetry.io/otel/metric v1.41.0 // indirect + go.opentelemetry.io/otel/trace v1.41.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect @@ -172,7 +220,7 @@ require ( golang.org/x/oauth2 v0.34.0 // indirect golang.org/x/telemetry v0.0.0-20260409153401-be6f6cb8b1fa // indirect golang.org/x/text v0.36.0 // indirect - golang.org/x/time v0.9.0 // indirect + golang.org/x/time v0.11.0 // indirect golang.org/x/tools v0.44.0 // indirect golang.org/x/vuln v1.2.0 // indirect golang.zx2c4.com/wireguard v0.0.0-20231211153847-12269c276173 // indirect diff --git a/go.sum b/go.sum index 91bab086..3cf29662 100644 --- a/go.sum +++ b/go.sum @@ -1,9 +1,9 @@ cel.dev/expr v0.25.1 h1:1KrZg61W6TWSxuNZ37Xy49ps13NUovb66QLprthtwi4= cel.dev/expr v0.25.1/go.mod h1:hrXvqGP6G6gyx8UAHSHJ5RGk//1Oj5nXQ2NI02Nrsg4= -dario.cat/mergo v1.0.1 h1:Ra4+bf83h2ztPIQYNP99R6m+Y7KfnARDfID+a+vLl4s= -dario.cat/mergo v1.0.1/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= -github.com/AdaLogics/go-fuzz-headers v0.0.0-20230106234847-43070de90fa1 h1:EKPd1INOIyr5hWOWhvpmQpY6tKjeG0hT1s3AMC/9fic= -github.com/AdaLogics/go-fuzz-headers v0.0.0-20230106234847-43070de90fa1/go.mod h1:VzwV+t+dZ9j/H867F1M2ziD+yLHtB46oM35FxxMJ4d0= +dario.cat/mergo v1.0.2 h1:85+piFYR1tMbRrLcDwR18y4UKJ3aH1Tbzi24VRW1TK8= +dario.cat/mergo v1.0.2/go.mod h1:E/hbnu0NxMFBjpMIE34DRGLWqDy0g5FuKDhCb31ngxA= +github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 h1:He8afgbRMd7mFxO99hRNu+6tazq8nFF9lIwo9JFroBk= +github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6/go.mod h1:8o94RPi1/7XTJvwPpRSzSUedZrtlirdB3r9Z20bi2f8= github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.1 h1:jHb/wfvRikGdxMXYV3QG/SzUOPYN9KEUUuC0Yd0/vC0= github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.1/go.mod h1:pzBXCYn05zvYIrwLgtK8Ap8QcjRg+0i76tMQdWN6wOk= github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 h1:Hk5QBxZQC1jb2Fwj6mpzme37xbCDdNTxU7O9eb5+LB4= @@ -46,8 +46,8 @@ github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/storage/armstorage v1.8.1 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/storage/armstorage v1.8.1/go.mod h1:Ng3urmn6dYe8gnbCMoHHVl5APYz2txho3koEkV2o2HA= github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.6.4 h1:jWQK1GI+LeGGUKBADtcH2rRqPxYB1Ljwms5gFA2LqrM= github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.6.4/go.mod h1:8mwH4klAm9DUgR2EEHyEEAQlRDvLPyg5fQry3y+cDew= -github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0= -github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= +github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c h1:udKWzYgxTojEKWjV8V+WSxDXJ4NFATAsZjh8iIbsQIg= +github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1 h1:WJTmL004Abzc5wDB5VtZG2PJk5ndYDgVacGqfirKxjM= github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1/go.mod h1:tCcJZ0uHAmvjsVYzEFivsRTN00oz5BEsRgQHu5JZ9WE= github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 h1:XRzhVemXdgvJqCH0sFfrBUTnUJSBrBf7++ypk+twtRs= @@ -59,6 +59,8 @@ github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1 github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/Masterminds/sprig/v3 v3.3.0 h1:mQh0Yrg1XPo6vjYXgtf5OtijNAKJRNcTdOOGZe3tPhs= github.com/Masterminds/sprig/v3 v3.3.0/go.mod h1:Zy1iXRYNqNLUolqCpL4uhk6SHUMAOSCzdgBfDb35Lz0= +github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= +github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= github.com/apex/log v1.9.0 h1:FHtw/xuaM8AgmvDDTI9fiwoAL25Sq2cxojnZICUU8l0= @@ -69,6 +71,42 @@ github.com/aphistic/sweet v0.2.0/go.mod h1:fWDlIh/isSE9n6EPsRmC0det+whmX6dJid3st github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/aws/aws-sdk-go v1.20.6/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo= +github.com/aws/aws-sdk-go-v2 v1.41.7 h1:DWpAJt66FmnnaRIOT/8ASTucrvuDPZASqhhLey6tLY8= +github.com/aws/aws-sdk-go-v2 v1.41.7/go.mod h1:4LAfZOPHNVNQEckOACQx60Y8pSRjIkNZQz1w92xpMJc= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.10 h1:gx1AwW1Iyk9Z9dD9F4akX5gnN3QZwUB20GGKH/I+Rho= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.10/go.mod h1:qqY157uZoqm5OXq/amuaBJyC9hgBCBQnsaWnPe905GY= +github.com/aws/aws-sdk-go-v2/config v1.32.17 h1:FpL4/758/diKwqbytU0prpuiu60fgXKUWCpDJtApclU= +github.com/aws/aws-sdk-go-v2/config v1.32.17/go.mod h1:OXqUMzgXytfoF9JaKkhrOYsyh72t9G+MJH8mMRaexOE= +github.com/aws/aws-sdk-go-v2/credentials v1.19.16 h1:r3RJBuU7X9ibt8RHbMjWE6y60QbKBiII6wSrXnapxSU= +github.com/aws/aws-sdk-go-v2/credentials v1.19.16/go.mod h1:6cx7zqDENJDbBIIWX6P8s0h6hqHC8Avbjh9Dseo27ug= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.23 h1:UuSfcORqNSz/ey3VPRS8TcVH2Ikf0/sC+Hdj400QI6U= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.23/go.mod h1:+G/OSGiOFnSOkYloKj/9M35s74LgVAdJBSD5lsFfqKg= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.23 h1:GpT/TrnBYuE5gan2cZbTtvP+JlHsutdmlV2YfEyNde0= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.23/go.mod h1:xYWD6BS9ywC5bS3sz9Xh04whO/hzK2plt2Zkyrp4JuA= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.23 h1:bpd8vxhlQi2r1hiueOw02f/duEPTMK59Q4QMAoTTtTo= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.23/go.mod h1:15DfR2nw+CRHIk0tqNyifu3G1YdAOy68RftkhMDDwYk= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.24 h1:OQqn11BtaYv1WLUowvcA30MpzIu8Ti4pcLPIIyoKZrA= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.24/go.mod h1:X5ZJyfwVrWA96GzPmUCWFQaEARPR7gCrpq2E92PJwAE= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.9 h1:FLudkZLt5ci0ozzgkVo8BJGwvqNaZbTWb3UcucAateA= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.9/go.mod h1:w7wZ/s9qK7c8g4al+UyoF1Sp/Z45UwMGcqIzLWVQHWk= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.15 h1:ieLCO1JxUWuxTZ1cRd0GAaeX7O6cIxnwk7tc1LsQhC4= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.15/go.mod h1:e3IzZvQ3kAWNykvE0Tr0RDZCMFInMvhku3qNpcIQXhM= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.23 h1:pbrxO/kuIwgEsOPLkaHu0O+m4fNgLU8B3vxQ+72jTPw= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.23/go.mod h1:/CMNUqoj46HpS3MNRDEDIwcgEnrtZlKRaHNaHxIFpNA= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.23 h1:03xatSQO4+AM1lTAbnRg5OK528EUg744nW7F73U8DKw= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.23/go.mod h1:M8l3mwgx5ToK7wot2sBBce/ojzgnPzZXUV445gTSyE8= +github.com/aws/aws-sdk-go-v2/service/s3 v1.101.0 h1:etqBTKY581iwLL/H/S2sVgk3C9lAsTJFeXWFDsDcWOU= +github.com/aws/aws-sdk-go-v2/service/s3 v1.101.0/go.mod h1:L2dcoOgS2VSgbPLvpak2NyUPsO1TBN7M45Z4H7DlRc4= +github.com/aws/aws-sdk-go-v2/service/signin v1.0.11 h1:TdJ+HdzOBhU8+iVAOGUTU63VXopcumCOF1paFulHWZc= +github.com/aws/aws-sdk-go-v2/service/signin v1.0.11/go.mod h1:R82ZRExE/nheo0N+T8zHPcLRTcH8MGsnR3BiVGX0TwI= +github.com/aws/aws-sdk-go-v2/service/sso v1.30.17 h1:7byT8HUWrgoRp6sXjxtZwgOKfhss5fW6SkLBtqzgRoE= +github.com/aws/aws-sdk-go-v2/service/sso v1.30.17/go.mod h1:xNWknVi4Ezm1vg1QsB/5EWpAJURq22uqd38U8qKvOJc= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.21 h1:+1Kl1zx6bWi4X7cKi3VYh29h8BvsCoHQEQ6ST9X8w7w= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.21/go.mod h1:4vIRDq+CJB2xFAXZ+YgGUTiEft7oAQlhIs71xcSeuVg= +github.com/aws/aws-sdk-go-v2/service/sts v1.42.1 h1:F/M5Y9I3nwr2IEpshZgh1GeHpOItExNM9L1euNuh/fk= +github.com/aws/aws-sdk-go-v2/service/sts v1.42.1/go.mod h1:mTNxImtovCOEEuD65mKW7DCsL+2gjEH+RPEAexAzAio= +github.com/aws/smithy-go v1.25.1 h1:J8ERsGSU7d+aCmdQur5Txg6bVoYelvQJgtZehD12GkI= +github.com/aws/smithy-go v1.25.1/go.mod h1:YE2RhdIuDbA5E5bTdciG9KrW3+TiEONeUWCqxX9i1Fc= github.com/aybabtme/rgbterm v0.0.0-20170906152045-cc83f3b3ce59/go.mod h1:q/89r3U2H7sSsE2t6Kca0lfwTK8JdoNGS/yzM/4iH5I= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= @@ -84,29 +122,41 @@ github.com/cilium/ebpf v0.21.0 h1:4dpx1J/B/1apeTmWBH5BkVLayHTkFrMovVPnHEk+l3k= github.com/cilium/ebpf v0.21.0/go.mod h1:1kHKv6Kvh5a6TePP5vvvoMa1bclRyzUXELSs272fmIQ= github.com/coder/websocket v1.8.14 h1:9L0p0iKiNOibykf283eHkKUHHrpG7f65OE3BhhO7v9g= github.com/coder/websocket v1.8.14/go.mod h1:NX3SzP+inril6yawo5CQXx8+fk145lPDC6pumgx0mVg= +github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= +github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M= +github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE= +github.com/containerd/errdefs/pkg v0.3.0/go.mod h1:NJw6s9HwNuRhnjJhM7pylWwMyAkmCQvQ4GpJHEqRLVk= github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= github.com/containerd/platforms v0.2.1 h1:zvwtM3rz2YHPQsF2CHYM8+KtB5dvhISiXh5ZpSBQv6A= github.com/containerd/platforms v0.2.1/go.mod h1:XHCb+2/hzowdiut9rkudds9bE5yJ7npe7dG/wG+uFPw= github.com/coreos/go-iptables v0.8.0 h1:MPc2P89IhuVpLI7ETL/2tx3XZ61VeICZjYqDEgNsPRc= github.com/coreos/go-iptables v0.8.0/go.mod h1:Qe8Bv2Xik5FyTXwgIbLAnv2sWSBmvWdFETJConOQ//Q= +github.com/cpuguy83/dockercfg v0.3.2 h1:DlJTyZGBDlXqUZ2Dk2Q3xHs/FtnooJJVaad2S9GKorA= +github.com/cpuguy83/dockercfg v0.3.2/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHfjj5/jFyUJc= github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo= github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= -github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= -github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= +github.com/creack/pty v1.1.24 h1:bJrF4RRfyJnbTJqzRLHzcGaZK1NeM5kTC9jGgovnR1s= +github.com/creack/pty v1.1.24/go.mod h1:08sCNb52WyoAwi2QDyzUCTgcvVFhUzewun7wtTfvcwE= github.com/cyphar/filepath-securejoin v0.5.0 h1:hIAhkRBMQ8nIeuVwcAoymp7MY4oherZdAxD+m0u9zaw= github.com/cyphar/filepath-securejoin v0.5.0/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= +github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= +github.com/docker/go-connections v0.6.0 h1:LlMG9azAe1TqfR7sO+NJttz1gy6KO7VJBh+pMmjSD94= +github.com/docker/go-connections v0.6.0/go.mod h1:AahvXYshr6JgfUJGdDCs2b5EZG/vmaMAntpSFH5BFKE= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= +github.com/ebitengine/purego v0.10.0 h1:QIw4xfpWT6GWTzaW5XEKy3HXoqrJGx1ijYHzTF0/ISU= +github.com/ebitengine/purego v0.10.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k= @@ -128,12 +178,15 @@ github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj2 github.com/go-errors/errors v1.4.2 h1:J6MZopCL4uSllY1OfXM374weqZFFItUbrImctkmUxIA= github.com/go-errors/errors v1.4.2/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og= github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= +github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= +github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= @@ -166,6 +219,7 @@ github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7O github.com/google/go-cmdtest v0.4.1-0.20220921163831-55ab3332a786 h1:rcv+Ippz6RAtvaGgKxc+8FQIpxHgsF+HBzPyYL2cyVU= github.com/google/go-cmdtest v0.4.1-0.20220921163831-55ab3332a786/go.mod h1:apVn/GCasLZUVpAJ6oWAuyP7Ne7CEsQbTnc0plM3m+o= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/go-configfs-tsm v0.3.3-0.20240919001351-b4b5b84fdcbc h1:SG12DWUUM5igxm+//YX5Yq4vhdoRnOG9HkCodkOn+YU= @@ -220,8 +274,8 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/keybase/go-keychain v0.0.1 h1:way+bWYa6lDppZoZcgMbYsvC7GxljxrskdNInRtuthU= github.com/keybase/go-keychain v0.0.1/go.mod h1:PdEILRW3i9D8JcdM+FmY6RwkHGnhHxXwkPPMeUgOK1k= -github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= -github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= +github.com/klauspost/compress v1.18.5 h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBFW2IE= +github.com/klauspost/compress v1.18.5/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ= github.com/klauspost/pgzip v1.2.6 h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU= github.com/klauspost/pgzip v1.2.6/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= @@ -239,6 +293,10 @@ github.com/lib/pq v1.12.3 h1:tTWxr2YLKwIvK90ZXEw8GP7UFHtcbTtty8zsI+YjrfQ= github.com/lib/pq v1.12.3/go.mod h1:/p+8NSbOcwzAEI7wiMXFlgydTwcgTr3OSKMsD2BitpA= github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de h1:9TO3cAIGXtEhnIaL+V+BEER86oLrvS+kWobKpbJuye0= github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de/go.mod h1:zAbeS9B/r2mtpb6U+EI2rYA5OAXxsYw6wTamcNW+zcE= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= +github.com/magiconair/properties v1.8.10 h1:s31yESBquKXCV9a/ScB3ESkOjUYYv+X0rg8SYxI99mE= +github.com/magiconair/properties v1.8.10/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcnceauSikq3lYCQ= @@ -266,14 +324,26 @@ github.com/mitchellh/copystructure v1.2.0 h1:vpKXTN4ewci03Vljg/q9QvCGUDttBOGBIa1 github.com/mitchellh/copystructure v1.2.0/go.mod h1:qLl+cE2AmVv+CoeAwDPye/v+N2HKCj9FbZEVFJRxO9s= github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ= github.com/mitchellh/reflectwalk v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw= +github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0= +github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo= +github.com/moby/go-archive v0.2.0 h1:zg5QDUM2mi0JIM9fdQZWC7U8+2ZfixfTYoHL7rWUcP8= +github.com/moby/go-archive v0.2.0/go.mod h1:mNeivT14o8xU+5q1YnNrkQVpK+dnNe/K6fHqnTg4qPU= +github.com/moby/moby/api v1.54.1 h1:TqVzuJkOLsgLDDwNLmYqACUuTehOHRGKiPhvH8V3Nn4= +github.com/moby/moby/api v1.54.1/go.mod h1:+RQ6wluLwtYaTd1WnPLykIDPekkuyD/ROWQClE83pzs= +github.com/moby/moby/client v0.4.0 h1:S+2XegzHQrrvTCvF6s5HFzcrywWQmuVnhOXe2kiWjIw= +github.com/moby/moby/client v0.4.0/go.mod h1:QWPbvWchQbxBNdaLSpoKpCdf5E+WxFAgNHogCWDoa7g= +github.com/moby/patternmatcher v0.6.1 h1:qlhtafmr6kgMIJjKJMDmMWq7WLkKIo23hsrpR3x084U= +github.com/moby/patternmatcher v0.6.1/go.mod h1:hDPoyOpDY7OrrMDLaYoY3hf52gNCR/YOUYxkhApJIxc= github.com/moby/spdystream v0.5.1 h1:9sNYeYZUcci9R6/w7KDaFWEWeV4LStVG78Mpyq/Zm/Y= github.com/moby/spdystream v0.5.1/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= +github.com/moby/sys/sequential v0.6.0 h1:qrx7XFUd/5DxtqcoH1h438hF5TmOvzC/lspjy7zgvCU= +github.com/moby/sys/sequential v0.6.0/go.mod h1:uyv8EUTrca5PnDsdMGXhZe6CCe8U/UiTWd+lL+7b/Ko= github.com/moby/sys/user v0.4.0 h1:jhcMKit7SA80hivmFJcbB1vqmw//wU61Zdui2eQXuMs= github.com/moby/sys/user v0.4.0/go.mod h1:bG+tYYYJgaMtRKgEmuueC0hJEAZWwtIbZTB+85uoHjs= github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g= github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28= -github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= -github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= +github.com/moby/term v0.5.2 h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ= +github.com/moby/term v0.5.2/go.mod h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFLc= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -331,6 +401,8 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU= +github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= @@ -354,10 +426,12 @@ github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQD github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ= github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= +github.com/shirou/gopsutil/v4 v4.26.3 h1:2ESdQt90yU3oXF/CdOlRCJxrP+Am1aBYubTMTfxJ1qc= +github.com/shirou/gopsutil/v4 v4.26.3/go.mod h1:LZ6ewCSkBqUpvSOf+LsTGnRinC6iaNUNMGBtDkJBaLQ= github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k= github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME= -github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= -github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/sirupsen/logrus v1.9.4 h1:TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w= +github.com/sirupsen/logrus v1.9.4/go.mod h1:ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g= github.com/smartystreets/assertions v1.0.0/go.mod h1:kHHU4qYBaI3q23Pp3VPrmWhuIUrLW/7eUrw0BU5VaoM= github.com/smartystreets/go-aws-auth v0.0.0-20180515143844-0c1422d1fdb9/go.mod h1:SnhjPscd9TpLiy1LpzGSKh3bXCfxxXuqd9xmQJy3slM= github.com/smartystreets/gunit v1.0.0/go.mod h1:qwPWnhz6pn0NnRBP++URONOVyNkPyr4SauJk4cUOwJs= @@ -375,8 +449,8 @@ github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8w github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= -github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= -github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= +github.com/stretchr/objx v0.5.3 h1:jmXUvGomnU1o3W/V5h2VEradbpJDwGrzugQQvL0POH4= +github.com/stretchr/objx v0.5.3/go.mod h1:rDQraq+vQZU7Fde9LOZLr8Tax6zZvy4kuNKF+QYS+U0= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= @@ -385,6 +459,8 @@ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/testcontainers/testcontainers-go v0.42.0 h1:He3IhTzTZOygSXLJPMX7n44XtK+qhjat1nI9cneBbUY= +github.com/testcontainers/testcontainers-go v0.42.0/go.mod h1:vZjdY1YmUA1qEForxOIOazfsrdyORJAbhi0bp8plN30= github.com/tj/assert v0.0.0-20171129193455-018094318fb0/go.mod h1:mZ9/Rh9oLWpLLDRpvE+3b7gP/C2YyLFYxNmcLnPTMe0= github.com/tj/assert v0.0.3 h1:Df/BlaZ20mq6kuai7f5z2TvPFiwC3xaWJSDQNiIS3Rk= github.com/tj/assert v0.0.3/go.mod h1:Ne6X72Q+TB1AteidzQncjw9PabbMp4PBMZ1k+vd1Pvk= @@ -392,6 +468,10 @@ github.com/tj/go-buffer v1.1.0/go.mod h1:iyiJpfFcR2B9sXu7KvjbT9fpM4mOelRSDTbntVj github.com/tj/go-elastic v0.0.0-20171221160941-36157cbbebc2/go.mod h1:WjeM0Oo1eNAjXGDx2yma7uG2XoyRZTq1uv3M/o7imD0= github.com/tj/go-kinesis v0.0.0-20171128231115-08b17f58cb1b/go.mod h1:/yhzCV0xPfx6jb1bBgRFjl5lytqVqZXEaeqWP8lTEao= github.com/tj/go-spin v1.1.0/go.mod h1:Mg1mzmePZm4dva8Qz60H2lHwmJ2loum4VIrLgVnKwh4= +github.com/tklauser/go-sysconf v0.3.16 h1:frioLaCQSsF5Cy1jgRBrzr6t502KIIwQ0MArYICU0nA= +github.com/tklauser/go-sysconf v0.3.16/go.mod h1:/qNL9xxDhc7tx3HSRsLWNnuzbVfh3e7gh/BmM179nYI= +github.com/tklauser/numcpus v0.11.0 h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9RXw= +github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ= github.com/u-root/uio v0.0.0-20230220225925-ffce2a382923 h1:tHNk7XK9GkmKUR6Gh8gVBKXc2MVSZ4G/NnWLtzw4gNA= github.com/u-root/uio v0.0.0-20230220225925-ffce2a382923/go.mod h1:eLL9Nub3yfAho7qB0MzZizFhTU2QkLeoVsWdHtDW264= github.com/urfave/cli v1.22.12 h1:igJgVw1JdKH+trcLWLeLwZjU9fEfPesQ+9/e4MQ44S8= @@ -408,6 +488,8 @@ github.com/xlab/treeprint v1.2.0 h1:HzHnuAF1plUN2zGlAFHbSQP2qJ0ZAD3XF5XD7OesXRQ= github.com/xlab/treeprint v1.2.0/go.mod h1:gj5Gd3gPdKtR1ikdDK6fnFLdmIS0X30kTTuNd/WEJu0= github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 h1:ilQV1hzziu+LLM3zUTJ0trRztfwgjqKnBWNtSRkbmwM= github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78/go.mod h1:aL8wCCfTfSfmXjznFBSZNN13rSJjlIOI1fUNAtF7rmI= +github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= +github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus= @@ -460,9 +542,10 @@ golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220622161953-175b2fd9d664/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= @@ -477,8 +560,8 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.36.0 h1:JfKh3XmcRPqZPKevfXVpI1wXPTqbkE5f7JA92a55Yxg= golang.org/x/text v0.36.0/go.mod h1:NIdBknypM8iqVmPiuco0Dh6P5Jcdk8lJL0CUebqK164= -golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= -golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/time v0.11.0 h1:/bpjEDfN9tkoN/ryeYHnv5hcMlc8ncjMcM4XBk5NWV0= +golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.44.0 h1:UP4ajHPIcuMjT1GqzDWRlalUEoY+uzoZKnhOjbIPD2c= golang.org/x/tools v0.44.0/go.mod h1:KA0AfVErSdxRZIsOVipbv3rQhVXTnlU6UhKxHd1seDI= @@ -526,6 +609,8 @@ gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C gopkg.in/yaml.v3 v3.0.0-20200605160147-a5ece683394c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gotest.tools/v3 v3.5.2 h1:7koQfIKdy+I8UTetycgUqXWSDwpgv193Ka+qRsmBY8Q= +gotest.tools/v3 v3.5.2/go.mod h1:LtdLGcnqToBH83WByAAi/wiwSFCArdFIUV/xxN4pcjA= k8s.io/api v0.35.4 h1:P7nFYKl5vo9AGUp1Z+Pmd3p2tA7bX2wbFWCvDeRv988= k8s.io/api v0.35.4/go.mod h1:yl4lqySWOgYJJf9RERXKUwE9g2y+CkuwG+xmcOK8wXU= k8s.io/apiextensions-apiserver v0.35.0 h1:3xHk2rTOdWXXJM+RDQZJvdx0yEOgC0FgQ1PlJatA5T4= @@ -582,6 +667,8 @@ modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y= modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM= oras.land/oras-go/v2 v2.6.0 h1:X4ELRsiGkrbeox69+9tzTu492FMUu7zJQW6eJU+I2oc= oras.land/oras-go/v2 v2.6.0/go.mod h1:magiQDfG6H1O9APp+rOsvCPcW1GD2MM7vgnKY0Y+u1o= +pgregory.net/rapid v1.2.0 h1:keKAYRcjm+e1F0oAuU5F5+YPAWcyxNNRK2wud503Gnk= +pgregory.net/rapid v1.2.0/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 h1:jpcvIRr3GLoUoEKRkHKSmGjxb6lWwrBlJsXc+eUYQHM= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= sigs.k8s.io/controller-runtime v0.23.3 h1:VjB/vhoPoA9l1kEKZHBMnQF33tdCLQKJtydy4iqwZ80= diff --git a/hack/cmd/orcaseed/main.go b/hack/cmd/orcaseed/main.go new file mode 100644 index 00000000..23e83cac --- /dev/null +++ b/hack/cmd/orcaseed/main.go @@ -0,0 +1,10 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package main + +import "github.com/Azure/unbounded/hack/cmd/orcaseed/orcaseed" + +func main() { + orcaseed.Run() +} diff --git a/hack/cmd/orcaseed/orcaseed/client.go b/hack/cmd/orcaseed/orcaseed/client.go new file mode 100644 index 00000000..473233b7 --- /dev/null +++ b/hack/cmd/orcaseed/orcaseed/client.go @@ -0,0 +1,102 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package orcaseed + +import ( + "context" + "fmt" + "strings" + + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/bloberror" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/container" +) + +// azuriteWellKnownDevKey is the documented well-known shared key for +// Azurite's default account ('devstoreaccount1'). It is a public +// constant baked into Azurite, not a secret. Documented at +// https://learn.microsoft.com/azure/storage/common/storage-use-azurite. +const azuriteWellKnownDevKey = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" + +// globalFlags carries the connection-shape flags that every subcommand +// honours. The defaults target the in-cluster Azurite emulator exposed +// to the host via the dev harness's NodePort 30100. +type globalFlags struct { + endpoint string + account string + accountKey string + containerName string + ensureContainer bool +} + +func defaultGlobalFlags() *globalFlags { + return &globalFlags{ + endpoint: "http://localhost:30100/devstoreaccount1/", + account: "devstoreaccount1", + accountKey: azuriteWellKnownDevKey, + containerName: "orca-test", + ensureContainer: true, + } +} + +// newClients constructs the azblob service + container clients from +// the global flags, applies the ensure-container behaviour if +// requested, and returns the container client ready for blob +// operations. +func (g *globalFlags) newClients(ctx context.Context) (*azblob.Client, *container.Client, error) { + if g.endpoint == "" { + return nil, nil, fmt.Errorf("--endpoint is required") + } + + if g.account == "" { + return nil, nil, fmt.Errorf("--account is required") + } + + if g.accountKey == "" { + return nil, nil, fmt.Errorf("--account-key is required") + } + + if g.containerName == "" { + return nil, nil, fmt.Errorf("--container is required") + } + + cred, err := azblob.NewSharedKeyCredential(g.account, g.accountKey) + if err != nil { + return nil, nil, fmt.Errorf("shared-key credential: %w", err) + } + // Trim a trailing slash so containerURL concatenation produces + // the expected single-slash boundary. + endpoint := strings.TrimRight(g.endpoint, "/") + + svc, err := azblob.NewClientWithSharedKeyCredential(endpoint, cred, nil) + if err != nil { + return nil, nil, fmt.Errorf("azblob client: %w", err) + } + + cc := svc.ServiceClient().NewContainerClient(g.containerName) + + if g.ensureContainer { + if err := ensureContainer(ctx, cc); err != nil { + return nil, nil, fmt.Errorf("ensure container %q: %w", g.containerName, err) + } + } + + return svc, cc, nil +} + +// ensureContainer creates the container if it does not exist. +// ContainerAlreadyExists is treated as success so callers can invoke +// this idempotently on every run. +func ensureContainer(ctx context.Context, cc *container.Client) error { + _, err := cc.Create(ctx, nil) + if err == nil { + return nil + } + + if bloberror.HasCode(err, bloberror.ContainerAlreadyExists) { + return nil + } + + return err +} diff --git a/hack/cmd/orcaseed/orcaseed/delete.go b/hack/cmd/orcaseed/orcaseed/delete.go new file mode 100644 index 00000000..47406b57 --- /dev/null +++ b/hack/cmd/orcaseed/orcaseed/delete.go @@ -0,0 +1,116 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package orcaseed + +import ( + "bufio" + "context" + "errors" + "fmt" + "io" + "os" + "strings" + + "github.com/spf13/cobra" + + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/container" +) + +type deleteOpts struct { + prefix string + yes bool +} + +func newDeleteCmd(g *globalFlags) *cobra.Command { + o := &deleteOpts{} + + cmd := &cobra.Command{ + Use: "delete", + Short: "Delete blobs from the container", + Long: `Delete removes every blob in the container whose name begins with +--prefix (default: all blobs). Without --yes the command lists the +matching set and prompts for confirmation on stdin.`, + RunE: func(cmd *cobra.Command, _ []string) error { + return runDelete(cmd.Context(), g, o) + }, + } + + cmd.Flags().StringVar(&o.prefix, "prefix", "", + "only delete blobs whose name begins with this prefix (empty = all)") + cmd.Flags().BoolVar(&o.yes, "yes", false, + "skip the interactive confirmation prompt") + + return cmd +} + +func runDelete(ctx context.Context, g *globalFlags, o *deleteOpts) error { + _, cc, err := g.newClients(ctx) + if err != nil { + return err + } + + opts := &container.ListBlobsFlatOptions{} + if o.prefix != "" { + opts.Prefix = &o.prefix + } + + var names []string + + pager := cc.NewListBlobsFlatPager(opts) + for pager.More() { + page, err := pager.NextPage(ctx) + if err != nil { + return fmt.Errorf("list: %w", err) + } + + for _, item := range page.Segment.BlobItems { + if item.Name != nil { + names = append(names, *item.Name) + } + } + } + + if len(names) == 0 { + fmt.Fprintf(os.Stderr, "no matching blobs in container %q\n", g.containerName) + return nil + } + + if !o.yes { + fmt.Fprintf(os.Stderr, "about to delete %d blob(s) from container %q:\n", + len(names), g.containerName) + + for _, n := range names { + fmt.Fprintf(os.Stderr, " %s\n", n) + } + + fmt.Fprint(os.Stderr, "proceed? [y/N]: ") + + r := bufio.NewReader(os.Stdin) + + line, err := r.ReadString('\n') + if err != nil { + if errors.Is(err, io.EOF) { + return fmt.Errorf("delete confirmation: stdin closed without input; pass --yes to skip the prompt in non-interactive contexts") + } + + return fmt.Errorf("read confirmation: %w", err) + } + + if strings.ToLower(strings.TrimSpace(line)) != "y" { + fmt.Fprintln(os.Stderr, "aborted.") + return nil + } + } + + for _, n := range names { + bc := cc.NewBlobClient(n) + if _, err := bc.Delete(ctx, nil); err != nil { + return fmt.Errorf("delete %s: %w", n, err) + } + } + + fmt.Fprintf(os.Stderr, "deleted %d blobs from container %q\n", len(names), g.containerName) + + return nil +} diff --git a/hack/cmd/orcaseed/orcaseed/generate.go b/hack/cmd/orcaseed/orcaseed/generate.go new file mode 100644 index 00000000..ae388832 --- /dev/null +++ b/hack/cmd/orcaseed/orcaseed/generate.go @@ -0,0 +1,229 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package orcaseed + +import ( + "context" + "crypto/rand" + "fmt" + "io" + mathrand "math/rand" + "os" + "sync/atomic" + "time" + + "github.com/spf13/cobra" + "golang.org/x/sync/errgroup" + + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blockblob" +) + +// generateOpts captures the per-command flags for the generate +// subcommand. Defaults are conservative (1 MiB x 1 blob) so an +// accidental invocation with no flags is harmless. +type generateOpts struct { + sizeStr string + count int + prefix string + seed int64 + concurrency int + force bool +} + +const ( + // perBlobMax is the per-blob ceiling. Larger blobs require + // --force to acknowledge. Picked at 1 GiB to match the operator's + // stated cap and keep accidental "1TiB" typos from filling the + // emulator's emptyDir. + perBlobMax int64 = 1024 * 1024 * 1024 + // totalWarn is the cumulative-bytes threshold above which the + // command logs a warning before proceeding. Sized to match + // perBlobMax for symmetry. + totalWarn int64 = 1024 * 1024 * 1024 +) + +func newGenerateCmd(g *globalFlags) *cobra.Command { + o := &generateOpts{ + sizeStr: "1MiB", + count: 1, + prefix: "synth-", + concurrency: 4, + } + + cmd := &cobra.Command{ + Use: "generate", + Short: "Generate N synthetic blobs of size S and upload them", + Long: `Generate creates --count blobs of --size random bytes each, named +0, 1, ... and uploads them to the configured +container. Use --seed to make the byte stream reproducible across +runs (useful when comparing cache behaviour between experiments).`, + RunE: func(cmd *cobra.Command, _ []string) error { + return runGenerate(cmd.Context(), g, o) + }, + } + + cmd.Flags().StringVar(&o.sizeStr, "size", o.sizeStr, + "per-blob size (e.g. 1MiB, 100MB, 1GiB)") + cmd.Flags().IntVar(&o.count, "count", o.count, + "number of blobs to generate") + cmd.Flags().StringVar(&o.prefix, "prefix", o.prefix, + "blob name prefix; blobs are named ") + cmd.Flags().Int64Var(&o.seed, "seed", o.seed, + "PRNG seed for deterministic content; 0 = use crypto/rand") + cmd.Flags().IntVar(&o.concurrency, "concurrency", o.concurrency, + "number of parallel uploads") + cmd.Flags().BoolVar(&o.force, "force", o.force, + "allow per-blob size > 1 GiB") + + return cmd +} + +func runGenerate(ctx context.Context, g *globalFlags, o *generateOpts) error { + if o.count < 1 { + return fmt.Errorf("--count must be >= 1") + } + + if o.concurrency < 1 { + o.concurrency = 1 + } + + size, err := parseSize(o.sizeStr) + if err != nil { + return fmt.Errorf("--size: %w", err) + } + + if size < 0 { + return fmt.Errorf("--size must be non-negative") + } + + if size > perBlobMax && !o.force { + return fmt.Errorf("--size %s exceeds per-blob ceiling %s; pass --force to override", + formatSize(size), formatSize(perBlobMax)) + } + + total := size * int64(o.count) + if total > totalWarn { + fmt.Fprintf(os.Stderr, "warning: cumulative upload is %s (size %s x count %d); proceeding\n", + formatSize(total), formatSize(size), o.count) + } + + _, cc, err := g.newClients(ctx) + if err != nil { + return err + } + + fmt.Fprintf(os.Stderr, "generating %d blobs of %s (total %s) into container %q at %s\n", + o.count, formatSize(size), formatSize(total), g.containerName, g.endpoint) + + var ( + uploaded atomic.Int64 + bytes atomic.Int64 + ) + + progressDone := make(chan struct{}) + + go func() { + defer close(progressDone) + + t := time.NewTicker(500 * time.Millisecond) + defer t.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-t.C: + done := uploaded.Load() + if done >= int64(o.count) { + return + } + + fmt.Fprintf(os.Stderr, " ... uploaded %d / %d (%s)\n", + done, o.count, formatSize(bytes.Load())) + } + } + }() + + g2, gctx := errgroup.WithContext(ctx) + g2.SetLimit(o.concurrency) + + for i := 0; i < o.count; i++ { + i := i + + g2.Go(func() error { + name := fmt.Sprintf("%s%d", o.prefix, i) + + body := newRandomReader(size, o.seed, int64(i)) + + bc := cc.NewBlockBlobClient(name) + if _, err := bc.UploadStream(gctx, body, &blockblob.UploadStreamOptions{}); err != nil { + return fmt.Errorf("upload %s: %w", name, err) + } + + uploaded.Add(1) + bytes.Add(size) + + return nil + }) + } + + if err := g2.Wait(); err != nil { + return err + } + + <-progressDone + + fmt.Fprintf(os.Stderr, "done: %d blobs, %s total\n", o.count, formatSize(bytes.Load())) + + return nil +} + +// newRandomReader returns an io.Reader producing exactly n bytes. +// When userSeed == 0 the bytes come from crypto/rand (non- +// deterministic, intended for typical seed-data workloads). When +// userSeed != 0 the per-blob byte stream is derived from +// math/rand.NewSource(userSeed + blobIndex), giving each blob its +// own independent deterministic stream. The per-blob derivation is +// what makes determinism survive --concurrency > 1: two invocations +// of `orcaseed generate --seed 42 --count N --concurrency K` +// produce byte-identical blobs regardless of upload-completion +// ordering, because each blob's content is a pure function of +// (userSeed, blobIndex). +func newRandomReader(n, userSeed, blobIndex int64) io.Reader { + if userSeed == 0 { + return io.LimitReader(rand.Reader, n) + } + + src := mathrand.NewSource(userSeed + blobIndex) + + return &seededReader{rng: mathrand.New(src), remaining: n} //nolint:gosec // dev tool, deterministic-by-design +} + +// seededReader produces exactly remaining bytes from a per-blob +// math/rand source. The source is not shared, so no mutex is +// required and reads do not block other goroutines. +type seededReader struct { + rng *mathrand.Rand + remaining int64 +} + +func (r *seededReader) Read(p []byte) (int, error) { + if r.remaining <= 0 { + return 0, io.EOF + } + + want := int64(len(p)) + if want > r.remaining { + want = r.remaining + } + + n, _ := r.rng.Read(p[:want]) //nolint:errcheck // math/rand never errors + + r.remaining -= int64(n) + if r.remaining == 0 { + return n, io.EOF + } + + return n, nil +} diff --git a/hack/cmd/orcaseed/orcaseed/list.go b/hack/cmd/orcaseed/orcaseed/list.go new file mode 100644 index 00000000..1e5ba309 --- /dev/null +++ b/hack/cmd/orcaseed/orcaseed/list.go @@ -0,0 +1,84 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package orcaseed + +import ( + "context" + "fmt" + "os" + + "github.com/spf13/cobra" + + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/container" +) + +type listOpts struct { + prefix string +} + +func newListCmd(g *globalFlags) *cobra.Command { + o := &listOpts{} + + cmd := &cobra.Command{ + Use: "list", + Short: "List blobs currently in the container", + Long: `List prints "\t" for each blob in the configured +container, optionally filtered by --prefix.`, + RunE: func(cmd *cobra.Command, _ []string) error { + return runList(cmd.Context(), g, o) + }, + } + + cmd.Flags().StringVar(&o.prefix, "prefix", "", + "only list blobs whose name begins with this prefix") + + return cmd +} + +func runList(ctx context.Context, g *globalFlags, o *listOpts) error { + _, cc, err := g.newClients(ctx) + if err != nil { + return err + } + + opts := &container.ListBlobsFlatOptions{} + if o.prefix != "" { + opts.Prefix = &o.prefix + } + + pager := cc.NewListBlobsFlatPager(opts) + + var ( + count int + total int64 + ) + + for pager.More() { + page, err := pager.NextPage(ctx) + if err != nil { + return fmt.Errorf("list: %w", err) + } + + for _, item := range page.Segment.BlobItems { + name := "" + if item.Name != nil { + name = *item.Name + } + + size := int64(0) + if item.Properties != nil && item.Properties.ContentLength != nil { + size = *item.Properties.ContentLength + } + + fmt.Printf("%-12s\t%s\n", formatSize(size), name) + + count++ + total += size + } + } + + fmt.Fprintf(os.Stderr, "(%d blobs, %s total)\n", count, formatSize(total)) + + return nil +} diff --git a/hack/cmd/orcaseed/orcaseed/orcaseed.go b/hack/cmd/orcaseed/orcaseed/orcaseed.go new file mode 100644 index 00000000..d43ee29f --- /dev/null +++ b/hack/cmd/orcaseed/orcaseed/orcaseed.go @@ -0,0 +1,62 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package orcaseed implements the `orcaseed` developer tool used by +// the Orca dev harness to populate the in-cluster Azurite origin +// container with synthetic or operator-supplied content. Four +// subcommands: +// +// generate - synthesise N blobs of size S each (random bytes; +// optionally seeded for reproducibility). +// upload - upload a single file from disk. +// list - print the blobs currently in the container. +// delete - remove blobs (optional --prefix filter). +// +// All subcommands share connection-shape flags (--endpoint, +// --account, --account-key, --container) defaulting to the dev +// harness's NodePort-exposed Azurite at localhost:30100. The +// well-known Azurite dev key is the default --account-key value; +// it is a public Microsoft-documented constant, not a secret. +package orcaseed + +import ( + "fmt" + "os" + + "github.com/spf13/cobra" +) + +// Run is the entrypoint invoked by cmd/orcaseed/main.go. Wires the +// cobra command tree, parses flags, dispatches to the chosen +// subcommand. On error prints to stderr and exits non-zero. +func Run() { + g := defaultGlobalFlags() + + root := &cobra.Command{ + Use: "orcaseed", + Short: "Populate the Orca dev-harness origin container", + SilenceUsage: true, + SilenceErrors: false, + } + + root.PersistentFlags().StringVar(&g.endpoint, "endpoint", g.endpoint, + "Azure Blob endpoint URL (path-style, account-included)") + root.PersistentFlags().StringVar(&g.account, "account", g.account, + "Storage account name") + root.PersistentFlags().StringVar(&g.accountKey, "account-key", g.accountKey, + "Shared key for the account (default: well-known Azurite dev key)") + root.PersistentFlags().StringVar(&g.containerName, "container", g.containerName, + "Container to operate against") + root.PersistentFlags().BoolVar(&g.ensureContainer, "ensure-container", g.ensureContainer, + "Create the container if it does not already exist") + + root.AddCommand(newGenerateCmd(g)) + root.AddCommand(newUploadCmd(g)) + root.AddCommand(newListCmd(g)) + root.AddCommand(newDeleteCmd(g)) + + if err := root.Execute(); err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(1) + } +} diff --git a/hack/cmd/orcaseed/orcaseed/orcaseed_test.go b/hack/cmd/orcaseed/orcaseed/orcaseed_test.go new file mode 100644 index 00000000..4ff33766 --- /dev/null +++ b/hack/cmd/orcaseed/orcaseed/orcaseed_test.go @@ -0,0 +1,282 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package orcaseed + +import ( + "context" + "encoding/base64" + "io" + "net/http" + "net/http/httptest" + "net/url" + "strings" + "sync/atomic" + "testing" +) + +// TestParseSize covers every accepted suffix and the error paths. +func TestParseSize(t *testing.T) { + t.Parallel() + + tests := []struct { + in string + want int64 + wantErr bool + }{ + {"1024", 1024, false}, + {"0", 0, false}, + {"1B", 1, false}, + {"1KB", 1000, false}, + {"1KiB", 1024, false}, + {"10MB", 10_000_000, false}, + {"10MiB", 10 * 1024 * 1024, false}, + {"1GB", 1_000_000_000, false}, + {"1GiB", 1024 * 1024 * 1024, false}, + {"1TB", 1_000_000_000_000, false}, + {"1TiB", 1024 * 1024 * 1024 * 1024, false}, + {"1.5GB", 1_500_000_000, false}, + {" 10MiB ", 10 * 1024 * 1024, false}, + {"10mib", 10 * 1024 * 1024, false}, + {"", 0, true}, + {"abc", 0, true}, + {"1XB", 0, true}, + {"-5MB", 0, true}, + } + + for _, tt := range tests { + t.Run(tt.in, func(t *testing.T) { + got, err := parseSize(tt.in) + if tt.wantErr { + if err == nil { + t.Errorf("parseSize(%q) = %d, want error", tt.in, got) + } + + return + } + + if err != nil { + t.Errorf("parseSize(%q) unexpected err: %v", tt.in, err) + return + } + + if got != tt.want { + t.Errorf("parseSize(%q) = %d, want %d", tt.in, got, tt.want) + } + }) + } +} + +// TestFormatSize spot-checks the human-readable rendering at the +// boundaries between units. +func TestFormatSize(t *testing.T) { + t.Parallel() + + tests := []struct { + in int64 + want string + }{ + {0, "0 B"}, + {512, "512 B"}, + {1024, "1.00 KiB"}, + {2048, "2.00 KiB"}, + {1024 * 1024, "1.00 MiB"}, + {10 * 1024 * 1024, "10.00 MiB"}, + {1024 * 1024 * 1024, "1.00 GiB"}, + } + + for _, tt := range tests { + got := formatSize(tt.in) + if got != tt.want { + t.Errorf("formatSize(%d) = %q, want %q", tt.in, got, tt.want) + } + } +} + +// TestGenerate_SeededDeterministic_Concurrent verifies that two +// generate runs with the same --seed produce byte-identical bodies +// even under concurrency > 1. The previous implementation used a +// shared math/rand source serialised through a mutex; bytes flowed +// to whichever goroutine acquired the lock first, so the same +// invocation could produce different per-blob bytes between runs +// based on goroutine-scheduling order. The fixed implementation +// derives each blob's stream from (seed + blobIndex), so each blob +// is a pure function of its index and seed regardless of +// completion ordering. +// +// Regression for C-6. +func TestGenerate_SeededDeterministic_Concurrent(t *testing.T) { + t.Parallel() + + bodiesA := startFakeAzurite(t) + defer bodiesA.close() + + bodiesB := startFakeAzurite(t) + defer bodiesB.close() + + g := defaultGlobalFlags() + g.endpoint = bodiesA.url + g.account = "devstoreaccount1" + g.accountKey = base64.StdEncoding.EncodeToString([]byte("test-shared-key-placeholder--32b")) + g.containerName = "ctr" + + o := &generateOpts{ + sizeStr: "4KiB", + count: 4, + prefix: "synth-", + seed: 42, + concurrency: 4, // deliberate: prove determinism survives parallel uploads + } + + if err := runGenerate(context.Background(), g, o); err != nil { + t.Fatalf("first runGenerate: %v", err) + } + + g.endpoint = bodiesB.url + + if err := runGenerate(context.Background(), g, o); err != nil { + t.Fatalf("second runGenerate: %v", err) + } + + for _, name := range []string{"synth-0", "synth-1", "synth-2", "synth-3"} { + a := bodiesA.get(name) + b := bodiesB.get(name) + + if len(a) == 0 { + t.Errorf("blob %q missing from first run", name) + continue + } + + if len(a) != len(b) { + t.Errorf("blob %q length differs across runs: %d vs %d", name, len(a), len(b)) + continue + } + + if string(a) != string(b) { + t.Errorf("blob %q bytes differ across two seeded runs (concurrency=%d)", + name, o.concurrency) + } + } +} + +// TestGenerate_SeededDifferentBlobsHaveDifferentContent verifies the +// per-blob seeding produces distinct streams (so two blobs in the +// same run are not byte-identical). +func TestGenerate_SeededDifferentBlobsHaveDifferentContent(t *testing.T) { + t.Parallel() + + bodies := startFakeAzurite(t) + defer bodies.close() + + g := defaultGlobalFlags() + g.endpoint = bodies.url + g.account = "devstoreaccount1" + g.accountKey = base64.StdEncoding.EncodeToString([]byte("test-shared-key-placeholder--32b")) + g.containerName = "ctr" + + o := &generateOpts{ + sizeStr: "4KiB", + count: 2, + prefix: "synth-", + seed: 99, + concurrency: 2, + } + + if err := runGenerate(context.Background(), g, o); err != nil { + t.Fatalf("runGenerate: %v", err) + } + + a := bodies.get("synth-0") + b := bodies.get("synth-1") + + if len(a) == 0 || len(b) == 0 { + t.Fatalf("blobs missing: synth-0=%d synth-1=%d", len(a), len(b)) + } + + if string(a) == string(b) { + t.Errorf("synth-0 and synth-1 have identical content; per-blob seeding broken") + } +} + +// fakeAzurite is a minimal httptest-backed server that: +// - accepts container Create (PUT ?restype=container) with 201; +// - accepts block-blob PUT at /// with 201; +// - records received bodies indexed by blob name; +// - rejects everything else with 400 so test failures are loud. +type fakeAzurite struct { + srv *httptest.Server + url string + mu atomic.Pointer[map[string][]byte] + requests atomic.Int64 +} + +func startFakeAzurite(t *testing.T) *fakeAzurite { + t.Helper() + + f := &fakeAzurite{} + bodies := make(map[string][]byte) + f.mu.Store(&bodies) + + mux := http.NewServeMux() + mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + f.requests.Add(1) + // path: //[/] + // We don't validate the SAS / shared-key signature; the SDK + // signs every request and we trust the format. + path := strings.TrimPrefix(r.URL.Path, "/") + + parts := strings.SplitN(path, "/", 3) + if len(parts) < 2 { + http.Error(w, "bad path", http.StatusBadRequest) + return + } + // Container create: PUT //?restype=container + if r.Method == http.MethodPut && len(parts) == 2 && r.URL.Query().Get("restype") == "container" { + w.WriteHeader(http.StatusCreated) + return + } + + if r.Method == http.MethodPut && len(parts) == 3 { + body, _ := io.ReadAll(r.Body) //nolint:errcheck // best-effort test reader + _ = r.Body.Close() //nolint:errcheck // best-effort + + cur := *f.mu.Load() + next := make(map[string][]byte, len(cur)+1) + + for k, v := range cur { + next[k] = v + } + + next[parts[2]] = body + f.mu.Store(&next) + + w.Header().Set("ETag", "\"fake-etag\"") + w.Header().Set("Last-Modified", "Thu, 01 Jan 1970 00:00:00 GMT") + w.WriteHeader(http.StatusCreated) + + return + } + + http.Error(w, "unexpected request: "+r.Method+" "+r.URL.String(), http.StatusBadRequest) + }) + + f.srv = httptest.NewServer(mux) + // Account-suffixed endpoint shape the SDK expects. + f.url = f.srv.URL + "/devstoreaccount1/" + + // Validate the URL parses cleanly. + if _, err := url.Parse(f.url); err != nil { + t.Fatalf("fake azurite endpoint parse: %v", err) + } + + return f +} + +func (f *fakeAzurite) close() { + f.srv.Close() +} + +func (f *fakeAzurite) get(name string) []byte { + cur := *f.mu.Load() + return cur[name] +} diff --git a/hack/cmd/orcaseed/orcaseed/size.go b/hack/cmd/orcaseed/orcaseed/size.go new file mode 100644 index 00000000..4ea835f5 --- /dev/null +++ b/hack/cmd/orcaseed/orcaseed/size.go @@ -0,0 +1,109 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package orcaseed + +import ( + "fmt" + "strconv" + "strings" +) + +// parseSize converts a human-readable size string into a byte count. +// Supports the following suffixes (case-insensitive): B, KB, KiB, MB, +// MiB, GB, GiB, TB, TiB. Decimal suffixes (KB, MB, ...) use base 1000; +// binary suffixes (KiB, MiB, ...) use base 1024. Bare numbers are +// interpreted as bytes. +// +// Examples: +// +// "1024" -> 1024 +// "1KB" -> 1000 +// "1KiB" -> 1024 +// "10MiB" -> 10485760 +// "1.5GB" -> 1500000000 +func parseSize(s string) (int64, error) { + s = strings.TrimSpace(s) + if s == "" { + return 0, fmt.Errorf("empty size string") + } + // Walk forward to find the numeric / suffix split. + i := 0 + for i < len(s) { + c := s[i] + if (c >= '0' && c <= '9') || c == '.' { + i++ + continue + } + + break + } + + if i == 0 { + return 0, fmt.Errorf("size %q has no numeric prefix", s) + } + + numStr := s[:i] + suffix := strings.ToLower(strings.TrimSpace(s[i:])) + + num, err := strconv.ParseFloat(numStr, 64) + if err != nil { + return 0, fmt.Errorf("invalid number %q: %w", numStr, err) + } + + if num < 0 { + return 0, fmt.Errorf("size must be non-negative, got %s", numStr) + } + + var mult int64 + + switch suffix { + case "", "b": + mult = 1 + case "k", "kb": + mult = 1000 + case "ki", "kib": + mult = 1024 + case "m", "mb": + mult = 1000 * 1000 + case "mi", "mib": + mult = 1024 * 1024 + case "g", "gb": + mult = 1000 * 1000 * 1000 + case "gi", "gib": + mult = 1024 * 1024 * 1024 + case "t", "tb": + mult = 1000 * 1000 * 1000 * 1000 + case "ti", "tib": + mult = 1024 * 1024 * 1024 * 1024 + default: + return 0, fmt.Errorf("size %q has unrecognized suffix %q (want B, KB/KiB, MB/MiB, GB/GiB, TB/TiB)", s, suffix) + } + + return int64(num * float64(mult)), nil +} + +// formatSize renders a byte count as a human-friendly string using +// binary suffixes (KiB, MiB, GiB). Used in progress and summary +// output where readability matters more than precision. +func formatSize(n int64) string { + const ( + kib int64 = 1024 + mib int64 = 1024 * kib + gib int64 = 1024 * mib + tib int64 = 1024 * gib + ) + + switch { + case n >= tib: + return fmt.Sprintf("%.2f TiB", float64(n)/float64(tib)) + case n >= gib: + return fmt.Sprintf("%.2f GiB", float64(n)/float64(gib)) + case n >= mib: + return fmt.Sprintf("%.2f MiB", float64(n)/float64(mib)) + case n >= kib: + return fmt.Sprintf("%.2f KiB", float64(n)/float64(kib)) + default: + return fmt.Sprintf("%d B", n) + } +} diff --git a/hack/cmd/orcaseed/orcaseed/upload.go b/hack/cmd/orcaseed/orcaseed/upload.go new file mode 100644 index 00000000..f746a26d --- /dev/null +++ b/hack/cmd/orcaseed/orcaseed/upload.go @@ -0,0 +1,85 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package orcaseed + +import ( + "context" + "fmt" + "os" + "path/filepath" + + "github.com/spf13/cobra" + + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blockblob" +) + +type uploadOpts struct { + file string + name string +} + +func newUploadCmd(g *globalFlags) *cobra.Command { + o := &uploadOpts{} + + cmd := &cobra.Command{ + Use: "upload", + Short: "Upload a single file from disk into the container", + Long: `Upload reads --file from local disk and stores it in the configured +container under --name (default: filepath.Base(--file)). The +upload streams in chunks; very large files don't buffer in memory.`, + RunE: func(cmd *cobra.Command, _ []string) error { + return runUpload(cmd.Context(), g, o) + }, + } + + cmd.Flags().StringVar(&o.file, "file", "", "local file to upload (required)") + cmd.Flags().StringVar(&o.name, "name", "", + "destination blob name (default: basename of --file)") + + return cmd +} + +func runUpload(ctx context.Context, g *globalFlags, o *uploadOpts) error { + if o.file == "" { + return fmt.Errorf("--file is required") + } + + st, err := os.Stat(o.file) + if err != nil { + return fmt.Errorf("stat --file: %w", err) + } + + if st.IsDir() { + return fmt.Errorf("--file %q is a directory; only single files are supported", o.file) + } + + name := o.name + if name == "" { + name = filepath.Base(o.file) + } + + _, cc, err := g.newClients(ctx) + if err != nil { + return err + } + + f, err := os.Open(o.file) + if err != nil { + return fmt.Errorf("open --file: %w", err) + } + + defer f.Close() //nolint:errcheck // upload tool, file close best-effort on success path + + fmt.Fprintf(os.Stderr, "uploading %s (%s) -> %s/%s\n", + o.file, formatSize(st.Size()), g.containerName, name) + + bc := cc.NewBlockBlobClient(name) + if _, err := bc.UploadStream(ctx, f, &blockblob.UploadStreamOptions{}); err != nil { + return fmt.Errorf("upload: %w", err) + } + + fmt.Fprintf(os.Stderr, "done.\n") + + return nil +} diff --git a/hack/cmd/render-manifests/main.go b/hack/cmd/render-manifests/main.go index 475c7129..187676fa 100644 --- a/hack/cmd/render-manifests/main.go +++ b/hack/cmd/render-manifests/main.go @@ -10,19 +10,19 @@ // evaluate to empty strings (text/template's missingkey=zero behaviour for map // data), which lets templates rely on sprig's `default` function to supply // documented fallbacks. +// +// The actual rendering logic lives in the render sub-package so it can be +// invoked programmatically from tests. package main import ( - "bytes" "flag" "fmt" "os" - "path/filepath" "sort" "strings" - "text/template" - "github.com/Masterminds/sprig/v3" + "github.com/Azure/unbounded/hack/cmd/render-manifests/render" ) // setFlags implements flag.Value for repeatable --set key=value arguments. @@ -75,60 +75,11 @@ func main() { exitWithError("--output-dir is required") } - if err := renderTemplates(templatesDir, outputDir, data); err != nil { + if err := render.Render(templatesDir, outputDir, data); err != nil { exitWithError(err.Error()) } } -func renderTemplates(templatesDir, outputDir string, data setFlags) error { - return filepath.WalkDir(templatesDir, func(path string, d os.DirEntry, err error) error { - if err != nil { - return err - } - - if d.IsDir() { - return nil - } - - if !strings.HasSuffix(path, ".yaml.tmpl") { - return nil - } - - relPath, err := filepath.Rel(templatesDir, path) - if err != nil { - return err - } - - outputRelPath := strings.TrimSuffix(relPath, ".tmpl") - outputPath := filepath.Join(outputDir, outputRelPath) - - templateBytes, err := os.ReadFile(path) - if err != nil { - return fmt.Errorf("read template %q: %w", path, err) - } - - tmpl, err := template.New(relPath).Funcs(sprig.TxtFuncMap()).Option("missingkey=zero").Parse(string(templateBytes)) - if err != nil { - return fmt.Errorf("parse template %q: %w", path, err) - } - - if err := os.MkdirAll(filepath.Dir(outputPath), 0o755); err != nil { - return fmt.Errorf("create output dir for %q: %w", outputPath, err) - } - - var rendered bytes.Buffer - if err := tmpl.Execute(&rendered, map[string]string(data)); err != nil { - return fmt.Errorf("execute template %q: %w", path, err) - } - - if err := os.WriteFile(outputPath, rendered.Bytes(), 0o644); err != nil { - return fmt.Errorf("write rendered manifest %q: %w", outputPath, err) - } - - return nil - }) -} - func exitWithError(message string) { fmt.Fprintln(os.Stderr, message) os.Exit(1) diff --git a/hack/cmd/render-manifests/render/render.go b/hack/cmd/render-manifests/render/render.go new file mode 100644 index 00000000..13d3dce5 --- /dev/null +++ b/hack/cmd/render-manifests/render/render.go @@ -0,0 +1,75 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package render implements the manifest template renderer used by +// the render-manifests CLI. Exposed as a package so tests in other +// packages (e.g. internal/orca/manifests) can render the orca +// templates programmatically without shelling out to `go run`. +package render + +import ( + "bytes" + "fmt" + "os" + "path/filepath" + "strings" + "text/template" + + "github.com/Masterminds/sprig/v3" +) + +// Render walks templatesDir for *.yaml.tmpl files, executes each with +// Go's text/template (plus the sprig function library), and writes +// the rendered output under outputDir mirroring the source tree. +// +// Template data is supplied via the data map. Missing keys evaluate +// to empty strings (text/template's missingkey=zero), which lets +// templates rely on sprig's `default` function for fallbacks. +func Render(templatesDir, outputDir string, data map[string]string) error { + return filepath.WalkDir(templatesDir, func(path string, d os.DirEntry, err error) error { + if err != nil { + return err + } + + if d.IsDir() { + return nil + } + + if !strings.HasSuffix(path, ".yaml.tmpl") { + return nil + } + + relPath, err := filepath.Rel(templatesDir, path) + if err != nil { + return err + } + + outputRelPath := strings.TrimSuffix(relPath, ".tmpl") + outputPath := filepath.Join(outputDir, outputRelPath) + + templateBytes, err := os.ReadFile(path) + if err != nil { + return fmt.Errorf("read template %q: %w", path, err) + } + + tmpl, err := template.New(relPath).Funcs(sprig.TxtFuncMap()).Option("missingkey=zero").Parse(string(templateBytes)) + if err != nil { + return fmt.Errorf("parse template %q: %w", path, err) + } + + if err := os.MkdirAll(filepath.Dir(outputPath), 0o755); err != nil { + return fmt.Errorf("create output dir for %q: %w", outputPath, err) + } + + var rendered bytes.Buffer + if err := tmpl.Execute(&rendered, data); err != nil { + return fmt.Errorf("execute template %q: %w", path, err) + } + + if err := os.WriteFile(outputPath, rendered.Bytes(), 0o644); err != nil { + return fmt.Errorf("write rendered manifest %q: %w", outputPath, err) + } + + return nil + }) +} diff --git a/hack/orca/.gitignore b/hack/orca/.gitignore new file mode 100644 index 00000000..e19a8c5e --- /dev/null +++ b/hack/orca/.gitignore @@ -0,0 +1,3 @@ +# Dev-only artifacts; never committed. +rendered-dev/ +.env diff --git a/hack/orca/Makefile b/hack/orca/Makefile new file mode 100644 index 00000000..92f0f171 --- /dev/null +++ b/hack/orca/Makefile @@ -0,0 +1,310 @@ +# hack/orca/Makefile - dev-harness targets for the Orca origin cache. +# +# Invoke from the repo root: `make -C hack/orca `. The root +# Makefile also defines `orca-up`, `orca-down`, `orca-reset` which +# proxy here. +# +# These targets stand up a local Kind cluster, build the Orca container +# image with podman, side-load it into Kind, deploy LocalStack as the +# cachestore backend, and apply the rendered Orca manifests. The +# harness validates the Kubernetes deployment shape (manifests, image, +# headless-Service DNS, RBAC, init-Job ordering); for Go-level +# behavior coverage use `make orca-inttest` which runs the in-process +# integration suite under internal/orca/inttest/. + +REPO_ROOT := $(abspath $(dir $(lastword $(MAKEFILE_LIST)))/../..) +HACK_DIR := $(dir $(lastword $(MAKEFILE_LIST))) + +# Cluster + namespace knobs. +CLUSTER_NAME ?= orca-dev +NAMESPACE ?= unbounded-kube +KIND_CONFIG ?= $(HACK_DIR)kind-config.yaml + +# Image tag pinned to :dev so kind load and rollout-restart use a +# stable identifier (the auto-derived VERSION can include slashes from +# git tags like images/agent-ubuntu2404-nvidia/v..., which are illegal +# in OCI tags). +ORCA_VERSION ?= dev +ORCA_IMAGE ?= ghcr.io/azure/orca:$(ORCA_VERSION) + +# Container engine (podman in CI, podman or docker locally). kind load +# image-archive accepts an OCI tarball produced by either. +CONTAINER_ENGINE ?= podman + +# Path to user .env (sourced by helper scripts that need it). +ENV_FILE ?= $(HACK_DIR).env + +# Rendered manifest dirs (per-Makefile target overrides for the dev +# rendering of pluggable orca manifests + the dev-only LocalStack/init +# manifests). +ORCA_RENDERED := $(REPO_ROOT)/deploy/orca/rendered +DEV_TEMPLATES := $(REPO_ROOT)/deploy/orca/dev +DEV_RENDERED := $(HACK_DIR)rendered-dev + +.PHONY: help up down reset render render-dev image kind-create kind-load \ + deploy deploy-localstack deploy-azurite deploy-azurite-maybe \ + deploy-credentials deploy-orca \ + wait-ready logs port-forward seed-azure status \ + seed-generate seed-upload seed-list seed-delete + +help: ## Show this help + @echo "" + @echo "Usage: make -C hack/orca [VAR=value ...]" + @echo "" + @echo "Lifecycle:" + @echo " up Bring up Kind cluster + LocalStack + Orca" + @echo " down Delete Kind cluster" + @echo " reset Rebuild image + rollout-restart deployment" + @echo "" + @echo "Pieces (typically called by 'up'):" + @echo " render Render orca manifests" + @echo " render-dev Render dev-only manifests (LocalStack, init job)" + @echo " image Build Orca container image (image-orca-local)" + @echo " kind-create Create the Kind cluster (idempotent)" + @echo " kind-load Load the Orca image into Kind nodes" + @echo " deploy-localstack Apply LocalStack Deployment + bucket init Job" + @echo " deploy-credentials Create the orca-credentials Secret from .env" + @echo " deploy-orca Apply rendered Orca manifests" + @echo " wait-ready Block until 3/3 orca pods are Ready" + @echo "" + @echo "Operate:" + @echo " status kubectl get pods -n $(NAMESPACE)" + @echo " logs Tail logs from all Orca pods" + @echo " port-forward Forward localhost:8443 -> svc/orca" + @echo " seed-azure Upload a file to real Azure (FILE=path; requires .env creds)" + @echo "" + @echo "Seed origin (Azurite via NodePort 30100; needs cluster up + ORIGIN_DRIVER=azureblob):" + @echo " seed-generate SEED_ARGS='--size 10MiB --count 5' Synthesise N blobs of size S" + @echo " seed-upload FILE=/path/to/file Upload a single file" + @echo " seed-list [SEED_ARGS='--prefix foo'] List blobs in the container" + @echo " seed-delete [PREFIX=foo] [SEED_ARGS='--yes'] Delete blobs (interactive by default)" + @echo "" + @echo "Note: For Go-level behavior testing (chunked GETs, cluster routing," + @echo "singleflight, peer fallback) use 'make orca-inttest' from the repo" + @echo "root. That suite exercises the same code paths against testcontainers" + @echo "without needing Kind." + @echo "" + @echo "Variables:" + @echo " CLUSTER_NAME=$(CLUSTER_NAME)" + @echo " NAMESPACE=$(NAMESPACE)" + @echo " ORCA_IMAGE=$(ORCA_IMAGE)" + @echo " CONTAINER_ENGINE=$(CONTAINER_ENGINE)" + @echo " ENV_FILE=$(ENV_FILE)" + +# -- Top-level lifecycle ------------------------------------------------------ + +up: kind-create image kind-load deploy ## End-to-end bring-up + +down: ## Delete Kind cluster + CLUSTER_NAME="$(CLUSTER_NAME)" $(HACK_DIR)down.sh + +reset: image kind-load ## Rebuild image and rolling-restart Orca + kubectl --context kind-$(CLUSTER_NAME) -n $(NAMESPACE) rollout restart deployment/orca + kubectl --context kind-$(CLUSTER_NAME) -n $(NAMESPACE) rollout status deployment/orca --timeout=120s + +# `deploy` deploys whichever origin backend matches ORIGIN_DRIVER in +# .env (default: awss3 -> LocalStack only; azureblob also brings up +# Azurite). The cachestore is always LocalStack regardless. Init Jobs +# are idempotent so re-applying is safe. +deploy: render render-dev deploy-localstack deploy-azurite-maybe deploy-credentials deploy-orca wait-ready ## Apply all manifests + Secret + +deploy-azurite-maybe: render-dev + @if [ -f "$(ENV_FILE)" ]; then set -a && . "$(ENV_FILE)" && set +a; fi; \ + driver="$${ORIGIN_DRIVER:-awss3}"; \ + if [ "$$driver" = "azureblob" ]; then \ + echo "ORIGIN_DRIVER=azureblob -> deploying Azurite"; \ + $(MAKE) deploy-azurite; \ + else \ + echo "ORIGIN_DRIVER=$$driver -> Azurite not required (skipping)"; \ + fi + +# -- Rendering ---------------------------------------------------------------- + +# Render the pluggable orca manifests with the dev image. Default +# origin driver in the dev harness is awss3 pointing at the same +# in-cluster LocalStack instance (different bucket); reviewers can +# override by setting ORIGIN_DRIVER=azureblob and the appropriate +# AZURE_* values in .env. Credentials are NOT rendered into the +# ConfigMap; they ride in via the orca-credentials Secret as env vars +# (envFrom). +render: + @echo "Rendering orca manifests with image=$(ORCA_IMAGE)" + @mkdir -p "$(ORCA_RENDERED)" + @find "$(ORCA_RENDERED)" -mindepth 1 -not -name .gitignore -delete 2>/dev/null || true + @if [ -f "$(ENV_FILE)" ]; then \ + set -a && . "$(ENV_FILE)" && set +a; \ + fi; \ + driver="$${ORIGIN_DRIVER:-awss3}"; \ + if [ "$$driver" = "azureblob" ]; then \ + azure_account="$${AZURE_STORAGE_ACCOUNT:-devstoreaccount1}"; \ + azure_container="$${AZURE_CONTAINER:-$${AZURITE_CONTAINER:-orca-test}}"; \ + azure_endpoint="$${AZUREBLOB_ENDPOINT:-http://azurite.$(NAMESPACE).svc.cluster.local:10000/devstoreaccount1/}"; \ + else \ + azure_account="$${AZURE_STORAGE_ACCOUNT:-}"; \ + azure_container="$${AZURE_CONTAINER:-orca-test}"; \ + azure_endpoint="$${AZUREBLOB_ENDPOINT:-}"; \ + fi; \ + go run "$(REPO_ROOT)/hack/cmd/render-manifests" \ + --templates-dir "$(REPO_ROOT)/deploy/orca" \ + --output-dir "$(ORCA_RENDERED)" \ + --set Namespace="$(NAMESPACE)" \ + --set Image="$(ORCA_IMAGE)" \ + --set ImagePullPolicy=IfNotPresent \ + --set TargetReplicas="$${TARGET_REPLICAS:-3}" \ + --set OriginID="$${ORIGIN_ID:-awss3-localstack}" \ + --set OriginDriver="$$driver" \ + --set AzureAccount="$$azure_account" \ + --set AzureContainer="$$azure_container" \ + --set AzureEndpoint="$$azure_endpoint" \ + --set OriginAWSS3Endpoint="$${ORIGIN_AWSS3_ENDPOINT:-http://localstack.$(NAMESPACE).svc.cluster.local:4566}" \ + --set OriginAWSS3Region="$${ORIGIN_AWSS3_REGION:-us-east-1}" \ + --set OriginAWSS3Bucket="$${ORIGIN_AWSS3_BUCKET:-orca-origin}" \ + --set OriginAWSS3UsePathStyle="true" \ + --set CachestoreBucket="$${CACHESTORE_BUCKET:-orca-cache}" \ + --set CachestoreEndpoint="$${CACHESTORE_ENDPOINT:-http://localstack.$(NAMESPACE).svc.cluster.local:4566}" \ + --set CachestoreRegion="$${CACHESTORE_REGION:-us-east-1}" \ + --set ClusterService="orca-peers.$(NAMESPACE).svc.cluster.local" \ + --set ServerAuthEnabled=false \ + --set InternalTLSEnabled=false \ + --set LogLevel="$${LOG_LEVEL:-info}" + +render-dev: + @echo "Rendering dev manifests (LocalStack, init job, Azurite)" + @mkdir -p "$(DEV_RENDERED)" + @find "$(DEV_RENDERED)" -mindepth 1 -delete 2>/dev/null || true + @if [ -f "$(ENV_FILE)" ]; then \ + set -a && . "$(ENV_FILE)" && set +a; \ + fi; \ + go run "$(REPO_ROOT)/hack/cmd/render-manifests" \ + --templates-dir "$(DEV_TEMPLATES)" \ + --output-dir "$(DEV_RENDERED)" \ + --set Namespace="$(NAMESPACE)" \ + --set CachestoreBucket="$${CACHESTORE_BUCKET:-orca-cache}" \ + --set OriginBucket="$${ORIGIN_AWSS3_BUCKET:-orca-origin}" \ + --set AzuriteContainer="$${AZURE_CONTAINER:-$${AZURITE_CONTAINER:-orca-test}}" \ + --set AzuriteNodePort="$${AZURITE_NODE_PORT:-30100}" + +# -- Image + cluster ---------------------------------------------------------- + +image: + @echo "Building Orca image $(ORCA_IMAGE) with $(CONTAINER_ENGINE)" + cd "$(REPO_ROOT)" && $(MAKE) image-orca-local \ + VERSION=$(ORCA_VERSION) \ + CONTAINER_ENGINE=$(CONTAINER_ENGINE) \ + ORCA_IMAGE=$(ORCA_IMAGE) + +kind-create: + CLUSTER_NAME="$(CLUSTER_NAME)" KIND_CONFIG="$(KIND_CONFIG)" $(HACK_DIR)kind-create.sh + +kind-load: + CLUSTER_NAME="$(CLUSTER_NAME)" \ + ORCA_IMAGE="$(ORCA_IMAGE)" \ + CONTAINER_ENGINE="$(CONTAINER_ENGINE)" \ + $(HACK_DIR)kind-load.sh + +# -- Deploy steps ------------------------------------------------------------- + +deploy-localstack: render-dev render + kubectl --context kind-$(CLUSTER_NAME) apply -f "$(ORCA_RENDERED)/01-namespace.yaml" + kubectl --context kind-$(CLUSTER_NAME) apply -f "$(DEV_RENDERED)/01-localstack.yaml" + kubectl --context kind-$(CLUSTER_NAME) -n $(NAMESPACE) rollout status deployment/localstack --timeout=120s + kubectl --context kind-$(CLUSTER_NAME) apply -f "$(DEV_RENDERED)/02-init-job.yaml" + kubectl --context kind-$(CLUSTER_NAME) -n $(NAMESPACE) wait --for=condition=complete job/orca-buckets-init --timeout=120s + +deploy-azurite: render-dev + kubectl --context kind-$(CLUSTER_NAME) apply -f "$(ORCA_RENDERED)/01-namespace.yaml" + kubectl --context kind-$(CLUSTER_NAME) apply -f "$(DEV_RENDERED)/03-azurite.yaml" + kubectl --context kind-$(CLUSTER_NAME) -n $(NAMESPACE) rollout status deployment/azurite --timeout=180s + kubectl --context kind-$(CLUSTER_NAME) apply -f "$(DEV_RENDERED)/04-azurite-init.yaml" + kubectl --context kind-$(CLUSTER_NAME) -n $(NAMESPACE) wait --for=condition=complete job/orca-azurite-container-init --timeout=180s + +deploy-credentials: + CLUSTER_NAME="$(CLUSTER_NAME)" \ + NAMESPACE="$(NAMESPACE)" \ + ENV_FILE="$(ENV_FILE)" \ + $(HACK_DIR)deploy-credentials.sh + +deploy-orca: render + kubectl --context kind-$(CLUSTER_NAME) apply -f "$(ORCA_RENDERED)/02-rbac.yaml" + kubectl --context kind-$(CLUSTER_NAME) apply -f "$(ORCA_RENDERED)/03-config.yaml" + # Service before Deployment: the headless orca-peers Service must + # exist (with its DNS A-records) before the pods start so the + # initial cluster.refresh sees the full peer set instead of + # bootstrapping into the self-only fallback. + kubectl --context kind-$(CLUSTER_NAME) apply -f "$(ORCA_RENDERED)/05-service.yaml" + kubectl --context kind-$(CLUSTER_NAME) apply -f "$(ORCA_RENDERED)/04-deployment.yaml" + +wait-ready: + kubectl --context kind-$(CLUSTER_NAME) -n $(NAMESPACE) rollout status deployment/orca --timeout=180s + +# -- Operate ------------------------------------------------------------------ + +status: + kubectl --context kind-$(CLUSTER_NAME) -n $(NAMESPACE) get pods -o wide + +logs: + kubectl --context kind-$(CLUSTER_NAME) -n $(NAMESPACE) logs -l app.kubernetes.io/name=orca --tail=200 -f + +port-forward: + @echo "Forwarding localhost:8443 -> svc/orca:8443 ..." + kubectl --context kind-$(CLUSTER_NAME) -n $(NAMESPACE) port-forward svc/orca 8443:8443 + +seed-azure: ## Upload a file to real Azure (requires AZURE_STORAGE_* in .env; pass FILE=...) + @[ -n "$(FILE)" ] || { echo "Usage: make seed-azure FILE=/path/to/file [SEED_ARGS='--name foo']" >&2; exit 1; } + @if [ -f "$(ENV_FILE)" ]; then set -a && . "$(ENV_FILE)" && set +a; fi; \ + [ -n "$${AZURE_STORAGE_ACCOUNT:-}" ] || { echo "AZURE_STORAGE_ACCOUNT not set in $(ENV_FILE)" >&2; exit 1; }; \ + [ -n "$${AZURE_STORAGE_KEY:-}" ] || { echo "AZURE_STORAGE_KEY not set in $(ENV_FILE)" >&2; exit 1; }; \ + [ -n "$${AZURE_CONTAINER:-}" ] || { echo "AZURE_CONTAINER not set in $(ENV_FILE)" >&2; exit 1; }; \ + go run "$(REPO_ROOT)/hack/cmd/orcaseed" upload \ + --endpoint "https://$${AZURE_STORAGE_ACCOUNT}.blob.core.windows.net/" \ + --account "$${AZURE_STORAGE_ACCOUNT}" \ + --account-key "$${AZURE_STORAGE_KEY}" \ + --container "$${AZURE_CONTAINER}" \ + --file "$(FILE)" \ + $(SEED_ARGS) + +# -- Seeder (orcaseed) helpers ------------------------------------------------ +# +# These targets invoke hack/cmd/orcaseed against the in-cluster Azurite +# emulator exposed on the host loopback via the NodePort 30100 baked +# into deploy/orca/dev/03-azurite.yaml.tmpl. Override AZURITE_NODE_PORT +# in .env if you've bumped the NodePort to avoid a host-port conflict. +# Pass extra flags via SEED_ARGS, e.g.: +# +# make -C hack/orca seed-generate SEED_ARGS='--size 10MiB --count 5' +# make -C hack/orca seed-upload FILE=~/data.tar.gz +# make -C hack/orca seed-list +# make -C hack/orca seed-delete PREFIX=synth- SEED_ARGS='--yes' + +SEED_ENDPOINT ?= http://localhost:$${AZURITE_NODE_PORT:-30100}/devstoreaccount1/ + +seed-generate: ## Generate synthetic blobs and upload to the Azurite origin + @if [ -f "$(ENV_FILE)" ]; then set -a && . "$(ENV_FILE)" && set +a; fi; \ + go run "$(REPO_ROOT)/hack/cmd/orcaseed" generate \ + --endpoint "$(SEED_ENDPOINT)" \ + --container "$${AZURE_CONTAINER:-orca-test}" \ + $(SEED_ARGS) + +seed-upload: ## Upload a file to the Azurite origin (use FILE=/path/to/file) + @[ -n "$(FILE)" ] || { echo "Usage: make seed-upload FILE=/path/to/file [SEED_ARGS='--name foo']" >&2; exit 1; } + @if [ -f "$(ENV_FILE)" ]; then set -a && . "$(ENV_FILE)" && set +a; fi; \ + go run "$(REPO_ROOT)/hack/cmd/orcaseed" upload \ + --endpoint "$(SEED_ENDPOINT)" \ + --container "$${AZURE_CONTAINER:-orca-test}" \ + --file "$(FILE)" \ + $(SEED_ARGS) + +seed-list: ## List blobs in the Azurite origin container + @if [ -f "$(ENV_FILE)" ]; then set -a && . "$(ENV_FILE)" && set +a; fi; \ + go run "$(REPO_ROOT)/hack/cmd/orcaseed" list \ + --endpoint "$(SEED_ENDPOINT)" \ + --container "$${AZURE_CONTAINER:-orca-test}" \ + $(SEED_ARGS) + +seed-delete: ## Delete blobs from the Azurite origin container (use PREFIX=foo) + @if [ -f "$(ENV_FILE)" ]; then set -a && . "$(ENV_FILE)" && set +a; fi; \ + go run "$(REPO_ROOT)/hack/cmd/orcaseed" delete \ + --endpoint "$(SEED_ENDPOINT)" \ + --container "$${AZURE_CONTAINER:-orca-test}" \ + --prefix "$(PREFIX)" \ + $(SEED_ARGS) diff --git a/hack/orca/deploy-credentials.sh b/hack/orca/deploy-credentials.sh new file mode 100755 index 00000000..0d8d8045 --- /dev/null +++ b/hack/orca/deploy-credentials.sh @@ -0,0 +1,94 @@ +#!/usr/bin/env bash +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# deploy-credentials.sh - create the orca-credentials Secret holding +# Azure Blob and S3 cachestore credentials. Sourced from .env so secret +# values never land in YAML. +# +# The dev harness defaults to ORIGIN_DRIVER=awss3 (LocalStack as both +# origin and cachestore), in which case AZURE_STORAGE_KEY is optional +# and the Azure key is omitted from the Secret. If you switch to +# ORIGIN_DRIVER=azureblob, AZURE_STORAGE_KEY becomes required. +set -euo pipefail + +CLUSTER_NAME=${CLUSTER_NAME:?CLUSTER_NAME must be set} +NAMESPACE=${NAMESPACE:?NAMESPACE must be set} +ENV_FILE=${ENV_FILE:?ENV_FILE must be set} + +if [[ -f "${ENV_FILE}" ]]; then + set -a + # shellcheck disable=SC1090 + . "${ENV_FILE}" + set +a +else + echo "Note: ${ENV_FILE} not found; proceeding with default awss3 origin (LocalStack)." +fi + +ORIGIN_DRIVER=${ORIGIN_DRIVER:-awss3} + +# LocalStack accepts any non-empty creds; pin to test/test for parity +# with manual aws-cli calls in the init Job. Both the cachestore and +# (when the awss3 origin driver targets in-cluster LocalStack) the +# origin use the same creds. +ORCA_CACHESTORE_S3_ACCESS_KEY=${ORCA_CACHESTORE_S3_ACCESS_KEY:-test} +ORCA_CACHESTORE_S3_SECRET_KEY=${ORCA_CACHESTORE_S3_SECRET_KEY:-test} +ORCA_AWSS3_ACCESS_KEY=${ORCA_AWSS3_ACCESS_KEY:-test} +ORCA_AWSS3_SECRET_KEY=${ORCA_AWSS3_SECRET_KEY:-test} + +# Build the kubectl literal flags conditionally so we don't ship empty +# strings as Azure keys in awss3 mode. +literals=( + "--from-literal=ORCA_CACHESTORE_S3_ACCESS_KEY=${ORCA_CACHESTORE_S3_ACCESS_KEY}" + "--from-literal=ORCA_CACHESTORE_S3_SECRET_KEY=${ORCA_CACHESTORE_S3_SECRET_KEY}" + "--from-literal=ORCA_AWSS3_ACCESS_KEY=${ORCA_AWSS3_ACCESS_KEY}" + "--from-literal=ORCA_AWSS3_SECRET_KEY=${ORCA_AWSS3_SECRET_KEY}" +) + +case "${ORIGIN_DRIVER}" in + azureblob) + # In azureblob+Azurite mode (no real Azure account), fall back to + # the well-known Azurite dev key. This is a public, documented + # constant baked into Azurite -- not a secret. + # + # Gate the fallback on AZURE_STORAGE_ACCOUNT being empty or the + # well-known Azurite account name. If the operator set a real + # account but forgot the key, hard-fail rather than silently + # injecting the Azurite dev key into the Secret (which would + # auth-fail at runtime against the real account and obscure the + # real problem). + if [[ -z "${AZURE_STORAGE_KEY:-}" ]]; then + case "${AZURE_STORAGE_ACCOUNT:-}" in + ""|"devstoreaccount1") + AZURITE_DEV_KEY="Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" + echo "AZURE_STORAGE_KEY not set; using Azurite well-known dev key (account: devstoreaccount1)." + AZURE_STORAGE_KEY="${AZURITE_DEV_KEY}" + ;; + *) + echo "ERROR: AZURE_STORAGE_KEY is required when AZURE_STORAGE_ACCOUNT=${AZURE_STORAGE_ACCOUNT}." >&2 + echo "The Azurite well-known dev key fallback only applies to account 'devstoreaccount1'." >&2 + exit 1 + ;; + esac + fi + literals+=("--from-literal=ORCA_AZUREBLOB_ACCOUNT_KEY=${AZURE_STORAGE_KEY}") + ;; + awss3) + if [[ -n "${AZURE_STORAGE_KEY:-}" ]]; then + # Allow it to be present so reviewers can switch drivers without + # editing secrets each time. + literals+=("--from-literal=ORCA_AZUREBLOB_ACCOUNT_KEY=${AZURE_STORAGE_KEY}") + fi + ;; + *) + echo "ERROR: unknown ORIGIN_DRIVER=${ORIGIN_DRIVER}" >&2 + exit 1 + ;; +esac + +echo "Creating/updating Secret orca-credentials in namespace ${NAMESPACE} (origin driver: ${ORIGIN_DRIVER}) ..." +kubectl --context "kind-${CLUSTER_NAME}" -n "${NAMESPACE}" create secret generic orca-credentials \ + "${literals[@]}" \ + --dry-run=client -o yaml | kubectl --context "kind-${CLUSTER_NAME}" apply -f - + +echo "orca-credentials Secret applied." diff --git a/hack/orca/dev-harness.md b/hack/orca/dev-harness.md new file mode 100644 index 00000000..5147dff9 --- /dev/null +++ b/hack/orca/dev-harness.md @@ -0,0 +1,336 @@ + + +# Orca Dev Harness + +A local end-to-end harness for the Orca origin cache. Stands up a Kind +cluster with three Orca replicas, an in-cluster LocalStack as the +cachestore, and an in-cluster origin (LocalStack S3 by default; Azurite +when `ORIGIN_DRIVER=azureblob`). Both default paths run with zero real +cloud credentials. The harness can also be flipped to point at a real +Azure Blob storage account. + +This document covers a single workstation. For the production +architecture and design rationale, see `design/orca/`. For Go-level +integration tests that exercise the same code paths without Kubernetes +(via testcontainers-managed LocalStack and Azurite), see +[inttest.md](./inttest.md). The two harnesses are complementary: this +one validates the K8s deployment shape (manifests, headless DNS, image +build/load); the integration tests cover the Go runtime behavior. + +## Origin modes + +| `ORIGIN_DRIVER` value | Origin backend | Driver path exercised | Creds needed | +| --------------------- | -------------- | --------------------- | ------------ | +| `awss3` (default) | LocalStack S3 (in-cluster) | `internal/orca/origin/awss3` | None | +| `azureblob` (Azurite) | Azurite (in-cluster) | `internal/orca/origin/azureblob` | None (well-known dev key) | +| `azureblob` (real Azure) | Azure Blob Storage | `internal/orca/origin/azureblob` | Account + key in `.env` | + +The cachestore is always in-cluster LocalStack S3 (different bucket +from the awss3 origin). + +## What you get + +- A Kind cluster named `orca-dev` with one control plane and three + worker nodes (one per Orca replica via required pod-anti-affinity). +- LocalStack 3.8 running in the cluster as the S3-compatible + cachestore (and origin in `awss3` mode). Community tier (`latest` + is Pro-only and exits with code 55 "License activation failed"). +- Azurite (Microsoft's official Azure Storage emulator) deployed on + demand when `ORIGIN_DRIVER=azureblob`. Runs from + `mcr.microsoft.com/azure-storage/azurite`. +- Buckets/containers pre-created by init Jobs: + - `orca-cache` (S3) - cachestore (versioning unset; Orca's + versioningGate rejects Enabled and Suspended). + - `orca-origin` (S3) - origin (used when `ORIGIN_DRIVER=awss3`). + - `orca-test` (Azure container) - origin (used when `ORIGIN_DRIVER=azureblob`). +- Three Orca replicas. mTLS between peers and bearer auth for + clients are both disabled in dev (`cluster.internal_tls.enabled=false`, + `server.auth.enabled=false`). +- Helper scripts (seed sample blobs, GET, LIST, clear cache, tail logs). + +## Prerequisites + +- `kind` (https://kind.sigs.k8s.io/), `kubectl`, `podman` (or `docker`). +- `go` toolchain (for `go run ./hack/cmd/render-manifests`). +- Optional (Azure mode only): a real Azure Storage account + container + + account key. + +No real cloud credentials are required for the default flow. + +## One-time setup + +```bash +cp hack/orca/.env.example hack/orca/.env +# Default values work; only edit if you want Azure mode. +``` + +`.env` is git-ignored. The default `ORIGIN_DRIVER=awss3` runs entirely +on the in-cluster LocalStack. + +## Bring it up + +```bash +make -C hack/orca up +``` + +This runs, in order: + +1. `kind-create` - create the `orca-dev` cluster (idempotent). +2. `image` - build `ghcr.io/azure/orca:dev` via `make image-orca-local`. +3. `kind-load` - save the image to a tar and `kind load image-archive`. +4. `render` - render `deploy/orca/*.yaml.tmpl` with values from `.env`. +5. `render-dev` - render `deploy/orca/dev/*.yaml.tmpl` (LocalStack, Azurite, init Jobs). +6. `deploy-localstack` - apply the namespace, LocalStack, wait until + ready, run the bucket-init Job (creates `orca-cache` + `orca-origin`), + wait for completion. +7. `deploy-azurite-maybe` - if `ORIGIN_DRIVER=azureblob`, deploy + Azurite + run its container-init Job. Skipped for `awss3`. +8. `deploy-credentials` - create the `orca-credentials` Secret. +9. `deploy-orca` - apply RBAC, ConfigMap, Services, Deployment. +10. `wait-ready` - block until all 3 replicas are Ready. + +When this finishes you should see something like: + +``` +$ make -C hack/orca status +NAME READY STATUS RESTARTS AGE +azurite-... 1/1 Running 0 1m (only in azureblob mode) +localstack-... 1/1 Running 0 1m +orca-azurite-container-init-... 0/1 Completed 0 1m (only in azureblob mode) +orca-buckets-init-... 0/1 Completed 0 1m +orca-7c5d4f9b8c-... 1/1 Running 0 50s +orca-7c5d4f9b8c-... 1/1 Running 0 50s +orca-7c5d4f9b8c-... 1/1 Running 0 50s +``` + +## Switching origins + +Edit `hack/orca/.env`, change `ORIGIN_DRIVER`, then: + +```bash +make -C hack/orca down +make -C hack/orca up +``` + +Or, to keep the cluster but reconfigure Orca and pull in any newly +needed backends: + +```bash +$EDITOR hack/orca/.env +make -C hack/orca deploy # idempotent; brings up Azurite if needed +make -C hack/orca reset # rolling-restart Orca with new ConfigMap +``` + +## Seed sample data + +The dev harness ships a small Go tool, `hack/cmd/orcaseed`, that +populates the origin container (Azurite or real Azure) with synthetic +or operator-supplied content. For the canonical recipe (Azurite +endpoint via NodePort 30100, the four subcommands wrapped as Make +targets, the per-blob ceiling, etc.) see +[quickstart.md - Step 3](./quickstart.md#step-3---seed-the-origin). + +For real Azure storage, the `seed-azure` Make target invokes +`orcaseed upload` against your account using credentials from `.env`: + +```bash +make -C hack/orca seed-azure FILE=/path/to/local-file +``` + +This replaces the legacy `seed-azure.sh` script (retired). Required +in `.env`: `AZURE_STORAGE_ACCOUNT`, `AZURE_STORAGE_KEY`, +`AZURE_CONTAINER`. The endpoint is computed as +`https://.blob.core.windows.net/`. + +For ad-hoc seeding into the in-cluster LocalStack S3 origin (the +default `awss3` mode), `orcaseed` does not currently speak S3; use a +one-off Job: + +```bash +kubectl --context kind-orca-dev -n unbounded-kube run orca-seed --rm -it \ + --image=amazon/aws-cli:latest --restart=Never \ + --env=AWS_ACCESS_KEY_ID=test \ + --env=AWS_SECRET_ACCESS_KEY=test \ + -- \ + --endpoint-url http://localstack.unbounded-kube.svc.cluster.local:4566 \ + s3 cp /tmp/your-file s3://orca-origin/your-key +``` + +## Exercise the cache + +See [quickstart.md - Steps 4-5](./quickstart.md#step-4---port-forward-the-orca-edge) +for the port-forward + `curl` walkthrough. The cluster-wide +deduplication, singleflight collapse, and warm-cache behavior are +verified deterministically by `make orca-inttest` against +testcontainers; this Kind harness is for validating the Kubernetes +deployment shape (manifests, image, headless DNS, RBAC, init-Job +ordering) and for ad-hoc operator exploration. + +## See cluster-wide deduplication in action + +The integration test `TestSingleflightCollapse` (under +`internal/orca/inttest/`) deterministically asserts this behavior +with byte-exact body checks and a `CountingOrigin` decorator. To +reproduce manually against this harness, fire concurrent GETs of a +fresh blob and tail the logs: + +```bash +make -C hack/orca logs +``` + +You should see exactly one chunk-fill per chunk-key across the +cluster (coordinator selected by rendezvous-hash). Replicas that +received the client request but are not the coordinator forward via +`/internal/fill`. Once a chunk is committed to the cachestore, +subsequent GETs (and joiners that arrived during the fill) read from +cache. + +## Switching to Azure mode (real Azure) + +Edit `hack/orca/.env` and set: + +``` +ORIGIN_DRIVER=azureblob +ORIGIN_ID=azureblob-real +AZURE_STORAGE_ACCOUNT= +AZURE_STORAGE_KEY= +AZURE_CONTAINER= +AZUREBLOB_ENDPOINT= # leave blank for real Azure +``` + +Then: + +```bash +make -C hack/orca deploy # idempotent +make -C hack/orca seed-azure FILE=/path/to/file # uploads via orcaseed -> real Azure +make -C hack/orca reset +``` + +The `seed-azure` target uses `hack/cmd/orcaseed` under the hood, +constructing the endpoint as `https://.blob.core.windows.net/` +and authenticating with `AZURE_STORAGE_KEY`. Pass `SEED_ARGS='--name foo'` +to override the destination blob name. + +## Reset / iterate + +```bash +# Rebuild the image and rolling-restart the deployment: +make -C hack/orca reset + +# Tear down the whole Kind cluster: +make -C hack/orca down +``` + +To clear the cachestore bucket between manual experiments, exec into +the LocalStack pod or run a one-off `aws s3 rm s3://orca-cache --recursive` +job; the prior canned script was retired alongside the seeding helpers. + +## Logging + +The Orca pods default to info-level structured JSON logging. Set +`LOG_LEVEL=debug` in `hack/orca/.env` (then `make -C hack/orca deploy +&& make -C hack/orca reset`) for persistent per-chunk debug tracing, +or `kubectl set env deployment/orca ORCA_LOG_LEVEL=debug` for a +one-off runtime override. See +[quickstart.md - Step 6](./quickstart.md#step-6---watch-the-per-chunk-debug-trace) +for the structured-log shape and `jq` filter examples. + +## Troubleshooting + +### `localstack` deployment never goes Ready + +Check the LocalStack pod's logs: + +```bash +kubectl --context kind-orca-dev -n unbounded-kube logs deploy/localstack +``` + +If you see "License activation failed" with exit code 55, you're on the +Pro-only `latest` tag. The dev harness pins `localstack/localstack:3.8` +specifically to avoid this. + +### `azurite` deployment never goes Ready (azureblob mode) + +Check the Azurite logs: + +```bash +kubectl --context kind-orca-dev -n unbounded-kube logs deploy/azurite +``` + +Most commonly the readiness probe is failing because Azurite was +launched with `--blobHost 127.0.0.1` (default) instead of `0.0.0.0`. +The harness's manifest already passes the right flag; if you've +overridden `AzuriteImage` to a custom build, ensure it accepts the +flag. + +### `orca-buckets-init` Job fails + +The Job waits up to 120 seconds for LocalStack readiness, then creates +both `orca-cache` and `orca-origin` and verifies cachestore versioning +is unset. Failures are typically LocalStack startup taking longer than +that on a slow disk; rerun the Job: + +```bash +kubectl --context kind-orca-dev -n unbounded-kube delete job orca-buckets-init --ignore-not-found +make -C hack/orca deploy-localstack +``` + +### Orca pods CrashLoopBackOff with "config invalid: ..." + +Check what's missing: + +```bash +kubectl --context kind-orca-dev -n unbounded-kube logs deploy/orca | head +``` + +Common causes: +- In Azure mode, an empty `AZURE_STORAGE_ACCOUNT`/`AZURE_CONTAINER` + (rendered into the ConfigMap). +- A missing `orca-credentials` Secret. + +Fix: + +```bash +$EDITOR hack/orca/.env +make -C hack/orca render # re-render ConfigMap from .env +make -C hack/orca deploy-credentials +kubectl --context kind-orca-dev -n unbounded-kube apply -f deploy/orca/rendered/03-config.yaml +make -C hack/orca reset +``` + +### "OriginUnreachable" or 502 from manual GETs + +In awss3 (default) mode: +- The bucket name in the URL must match `ORIGIN_AWSS3_BUCKET` (default + `orca-origin`). +- Seed the bucket manually with `kubectl run orca-seed --rm -it + --image=amazon/aws-cli:latest -- ...`. + +In Azure mode: +- Account key wrong or revoked. Re-run `make -C hack/orca deploy-credentials && make -C hack/orca reset`. +- The blob doesn't exist in `$AZURE_CONTAINER`. Run `make -C hack/orca seed-azure`. + +### kind load fails with "tag not found" + +The `make image` target tags the image as `ghcr.io/azure/orca:dev` (the +default `ORCA_VERSION=dev`). If you overrode `VERSION` and got a slash +in the tag (git describe can produce e.g. +`images/agent-ubuntu2404-nvidia/v...-dirty`), the OCI tag is invalid. +Stick with `ORCA_VERSION=dev` for the dev harness. + +## What this harness does NOT cover + +- `cachestore/posixfs` and `cachestore/localfs` drivers (deferred; v1 + prototype has only `cachestore/s3`). +- Production auth (bearer tokens, mTLS edge, internal mTLS). All three + are disabled by config in dev. +- Edge rate limiting and dynamic per-replica origin caps (see s15 + deferred-optimizations in `design/orca/design.md`). +- Mid-stream origin resume; if origin stalls after first byte the + client sees a truncated body. Acceptable for the prototype. +- Crash recovery / unowned-key sweep (post-MVP). + +For more on what's in vs out of scope, see `design/orca/design.md` +(in particular the +[Deferred / future work](../../designs/orca/design.md#15-deferred--future-work) +section). diff --git a/hack/orca/down.sh b/hack/orca/down.sh new file mode 100755 index 00000000..3d59a7c8 --- /dev/null +++ b/hack/orca/down.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# down.sh - delete the Orca dev Kind cluster. +set -euo pipefail + +CLUSTER_NAME=${CLUSTER_NAME:?CLUSTER_NAME must be set} + +if ! command -v kind >/dev/null 2>&1; then + echo "kind is not installed; nothing to do." >&2 + exit 0 +fi + +if ! kind get clusters 2>/dev/null | grep -qx "${CLUSTER_NAME}"; then + echo "No Kind cluster named '${CLUSTER_NAME}'; nothing to delete." + exit 0 +fi + +echo "Deleting Kind cluster '${CLUSTER_NAME}' ..." +kind delete cluster --name "${CLUSTER_NAME}" diff --git a/hack/orca/inttest.md b/hack/orca/inttest.md new file mode 100644 index 00000000..29a737d2 --- /dev/null +++ b/hack/orca/inttest.md @@ -0,0 +1,215 @@ + + +# Orca Integration Tests + +In-process integration tests for the Orca origin cache. The harness +brings up real LocalStack and Azurite containers via +`testcontainers-go` and constructs N in-process `*app.App` instances +wired to those containers. No Kubernetes cluster is required. + +For the Kubernetes-flavored deployment validation harness (Kind + +manifests + headless DNS), see [dev-harness.md](./dev-harness.md). The +two harnesses are complementary: the integration tests cover Go-level +behavior (origin, cachestore, fetch coordinator, cluster routing, +internal-fill RPC); the dev harness covers the manifest + deployment +shape. + +## Prerequisites + +- Docker (or any `DOCKER_HOST`-compatible daemon) reachable from the + test process. `testcontainers-go` discovers it via `DOCKER_HOST`, + `~/.docker/`, or the standard socket location. +- `gcc` for `-race` (CGO is required by Go's race detector). On + GitHub-hosted Ubuntu runners this is preinstalled. Locally without + `gcc`, the Makefile target drops `-race` automatically. + +## Running + +```sh +make orca-inttest +``` + +Equivalent to: + +```sh +go test -tags=integrationtest -timeout 15m ./internal/orca/inttest/... +# CI also adds -race +``` + +First run pulls `localstack/localstack:3.8` (~700 MB) and +`mcr.microsoft.com/azure-storage/azurite:3.34.0` (~150 MB). Subsequent +runs reuse the cached images. Total run time on a warm runner is on +the order of 25-30 seconds for the entire suite (most of which is +streaming the 64 MiB multi-chunk blob through the full origin -> +fetch coordinator -> cachestore pipeline). + +## Topology + +Every test (except the lifecycle tests) runs against a 3-replica +in-process cluster, matching the production `deploy/orca` topology. +All replicas bind to `127.0.0.1` with distinct OS-assigned internal +ports. Each replica owns its own `StaticPeerSource` so tests can +mutate one replica's view of the cluster independently. + +``` + ┌──────────────────────────────────────┐ + │ Test Process │ + │ │ + ┌─────────┐ │ ┌──────────┐ ┌───────────────┐ │ + │ Test t │────┼─▶│ Client │───▶│ Replica 1 │ │ + └─────────┘ │ │ (HTTP) │ │ 127.0.0.1:e1 │ │ + │ └──────────┘ │ internal :i1 │ │ + │ └───────┬───────┘ │ + │ ┌─────────────┐ │ peers │ + │ │ Per-replica │◀────────┤ via │ + │ │ Static │ │ static │ + │ │ PeerSources │ │ source │ + │ └─────────────┘ │ │ + │ ┌───────▼───────┐ │ + │ │ Replica 2 │ │ + │ │ 127.0.0.1:e2 │ │ + │ │ internal :i2 │ │ + │ └───────┬───────┘ │ + │ ┌───────▼───────┐ │ + │ │ Replica 3 │ │ + │ │ 127.0.0.1:e3 │ │ + │ │ internal :i3 │ │ + │ └───────┬───────┘ │ + └──────────────────────────┼───────────┘ + │ + ┌──────────────────┴───────────┐ + ▼ ▼ + ┌────────────────┐ ┌────────────┐ + │ LocalStack │ │ Azurite │ + │ (origin S3 + │ │ (origin │ + │ cachestore) │ │ blob) │ + └────────────────┘ └────────────┘ +``` + +## File layout + +``` +internal/orca/inttest/ +├── doc.go package overview, build tag, TODOs +├── images.go pinned container image tags + Azurite dev creds +├── localstack.go testcontainers wrapper + S3 helpers +├── azurite.go testcontainers wrapper + azblob helpers +├── seed.go SmallBlob/MediumBlob/LargeBlob + SeedS3/SeedAzure +├── peersource.go StaticPeerSource (cluster.PeerSource impl) +├── harness.go StartCluster orchestrator +├── client.go typed HTTP helpers (Get / GetRange / Head / List) +├── originwrap.go CountingOrigin decorator +├── internalwrap.go CountingInternalHandlerWrap (per-IP status counts) +├── origins_test.go origin builder helpers +├── main_test.go TestMain (shared LocalStack + Azurite) +├── e2e_test.go canonical 3-replica end-to-end suite +└── azure_test.go azureblob origin smoke (3 replicas) +``` + +Driver-level branch coverage (versioning gate, blob-type rejection) +lives as fast unit tests in the respective driver packages +(`internal/orca/cachestore/s3`, `internal/orca/origin/azureblob`), +not here. Those tests run as part of `go test ./...` and cover all +state branches (empty / Enabled / Suspended versioning; +BlockBlob / PageBlob / AppendBlob / nil / disabled). + +## Test inventory + +The integration suite contains **7 tests** focused exclusively on +behavior that requires real LocalStack/Azurite + a real cluster of +in-process orca instances. Driver-level branch coverage (versioning +gate, blob-type rejection, HTTP error mapping, range parsing, chunk +arithmetic, config env-var fallback, manifest YAML validity) lives as +fast unit tests in the respective packages and runs as part of +`make test`. + +### `e2e_test.go` (3-replica default) + +Tests that exercise chunk fetching naturally exercise both the +local-fill path (when self happens to win rendezvous for a chunk) and +the cross-replica `/internal/fill` path (when a peer wins). + +- `TestColdAndWarmGet` - cold + warm, warm phase deletes origin + object first to prove cache hit. +- `TestRangedGet` - within-chunk and cross-chunk byte ranges plus + several boundary edge cases against a 64-chunk blob (range starts + exactly at a boundary, ends exactly at a boundary, covers + contiguous full chunks, straddles 5 consecutive boundaries). +- `TestMultiChunkGet` - 64 MiB / 64 chunks, byte-exact full GET. With + 3 replicas, statistically every replica is the coordinator for + many chunks, exercising both fillLocal and FillFromPeer paths. +- `TestRendezvousCoordinatorRouting` - GET against a non-coordinator + routes through `/internal/fill`; `CountingOrigin` confirms exactly + one origin GetRange happened cluster-wide. +- `TestSingleflightCollapse` - 3 concurrent GETs from 3 replicas for + the same 64-chunk blob collapse to >= 64 (and <= 76) origin + GetRanges, proving cluster-wide singleflight is genuinely deduping. +- `TestPeerNotCoordinatorFallback` - real membership-disagreement + test. Crafts a phantom peer whose rendezvous score beats the + coord's for k, mutates the coord's `StaticPeerSource` to include + the phantom, GET via a non-coord replica that still views the real + coord as coordinator, asserts (a) byte-exact body and (b) + `counter409.Count(coord) >= 1` proving the 409 fallback fired. + +### `azure_test.go` (3-replica default) + +- `TestAzureBlobOrigin_ColdGet` - the `azureblob` driver works + end-to-end against Azurite for a 2-chunk block blob. + +### Where the dropped scenarios moved + +| Dropped from integration | Lives now as | +|---|---| +| `TestBootSelfTest_Pass` | implicit in every other `StartCluster` test (boots through the same `app.Start` path) | +| `TestNotFound` | `internal/orca/server.TestWriteOriginError` (covers all 5 error mappings) | +| `TestList` | `internal/orca/server.TestHandleList` (covers normal/empty/truncated/error) | +| `TestHead` | `internal/orca/server.TestHandleHead` (covers normal/missing-fields/404) | +| `TestVersionedCachestoreBucketRefused` | `internal/orca/cachestore/s3.TestValidateBucketVersioning` (covers all 3 statuses) | +| `TestAzureUnsupportedBlobType` | `internal/orca/origin/azureblob.TestValidateBlobType` (covers all 5 cases) | + +## Production-code seams used + +The harness depends on three test-friendly seams in production code: + +1. **`cluster.PeerSource`**: replaces the entire peer-discovery + mechanism. Production constructs a DNS-backed source implicitly + from `cfg.Cluster.Service` + `net.DefaultResolver`. Tests inject + per-replica `StaticPeerSource` instances with explicit ports so + multiple replicas can share an IP. + +2. **`cluster.Peer.Port`**: zero in production (peer addressed on + `cfg.Cluster.InternalListen` port); set in tests so `FillFromPeer` + dials each peer's distinct port. + +3. **`internal/orca/app.Start(ctx, *config.Config, ...Option)`**: + programmatic factory wiring origin / cachestore / cluster / fetch + coordinator / edge + internal listeners. Options: + - `WithLogger`, `WithResolver`, `WithPeerSource`, + - `WithOrigin`, `WithCacheStore`, `WithSkipCachestoreSelfTest`, + - `WithInternalHandlerWrap` for the 409 counter. + +Production goes through none of these. + +## Adding a scenario + +1. Pick the right entry point: + - 3-replica e2e (most cases): `StartCluster(ctx, t, opts)`. + - Driver-level branch coverage (versioning gate, blob-type + rejection, etc.): write a unit test in the driver's package + against the extracted pure helpers (`validateBucketVersioning`, + `validateBlobType`). +2. Seed the origin: `SeedS3` or `SeedAzure`. +3. Issue requests via `cl.Get(i).HTTP.Get / GetRange / Head / List`. +4. Assert byte-exact body, status code, and (where relevant) origin + RPC counts via `CountingOrigin` (`opts.OriginOverride`) or peer + 409 counts via `CountingInternalHandlerWrap` + (`opts.InternalHandlerWrap`). + +## Future work + +Tracked in `doc.go` TODOs: + +- `TestEtagChange` (mid-fill mutation): requires a deterministic test + seam in `fetch.Coordinator` to pause between chunk fetches. +- Fault-injection origin / cachestore decorators: timeout, throttle, + 5xx retry-budget assertions. diff --git a/hack/orca/kind-config.yaml b/hack/orca/kind-config.yaml new file mode 100644 index 00000000..0f5b2d21 --- /dev/null +++ b/hack/orca/kind-config.yaml @@ -0,0 +1,22 @@ +# Kind cluster config for the Orca dev harness. +# +# 1 control-plane + 3 workers. The 3 workers match Orca's default +# replica count and the required pod-anti-affinity (hostname topology). +# +# extraPortMappings on the first worker exposes Azurite's NodePort +# (default 30100) to the host so the seeder tool (hack/cmd/orcaseed) +# can reach Azurite at http://localhost:30100/devstoreaccount1/ +# without a kubectl port-forward. NodePort services in Kind aren't +# routable from the host without explicit port mappings. +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +name: orca-dev +nodes: + - role: control-plane + - role: worker + extraPortMappings: + - containerPort: 30100 + hostPort: 30100 + protocol: TCP + - role: worker + - role: worker diff --git a/hack/orca/kind-create.sh b/hack/orca/kind-create.sh new file mode 100755 index 00000000..4b0300ab --- /dev/null +++ b/hack/orca/kind-create.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# kind-create.sh - create the Orca dev Kind cluster idempotently. +set -euo pipefail + +CLUSTER_NAME=${CLUSTER_NAME:?CLUSTER_NAME must be set} +KIND_CONFIG=${KIND_CONFIG:?KIND_CONFIG must be set} + +if ! command -v kind >/dev/null 2>&1; then + echo "kind is not installed. See https://kind.sigs.k8s.io/docs/user/quick-start/#installation" >&2 + exit 1 +fi + +if kind get clusters 2>/dev/null | grep -qx "${CLUSTER_NAME}"; then + echo "Kind cluster '${CLUSTER_NAME}' already exists; skipping creation." + exit 0 +fi + +echo "Creating Kind cluster '${CLUSTER_NAME}' from ${KIND_CONFIG} ..." +kind create cluster --name "${CLUSTER_NAME}" --config "${KIND_CONFIG}" --wait 120s + +echo "Cluster ready. Current context:" +kubectl config current-context diff --git a/hack/orca/kind-load.sh b/hack/orca/kind-load.sh new file mode 100755 index 00000000..c1b51d8d --- /dev/null +++ b/hack/orca/kind-load.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# kind-load.sh - sideload the Orca container image into the Kind nodes. +# +# Kind clusters can't pull from the local container engine's image +# store directly. This script saves the image to a tarball with the +# configured CONTAINER_ENGINE and feeds it to `kind load image-archive`. +set -euo pipefail + +CLUSTER_NAME=${CLUSTER_NAME:?CLUSTER_NAME must be set} +ORCA_IMAGE=${ORCA_IMAGE:?ORCA_IMAGE must be set} +CONTAINER_ENGINE=${CONTAINER_ENGINE:-podman} + +if ! command -v kind >/dev/null 2>&1; then + echo "kind is not installed." >&2 + exit 1 +fi + +tmpdir=$(mktemp -d) +trap 'rm -rf "${tmpdir}"' EXIT + +archive="${tmpdir}/orca.tar" +echo "Saving ${ORCA_IMAGE} to ${archive} via ${CONTAINER_ENGINE} ..." +"${CONTAINER_ENGINE}" save -o "${archive}" "${ORCA_IMAGE}" + +echo "Loading image into Kind cluster '${CLUSTER_NAME}' ..." +kind load image-archive "${archive}" --name "${CLUSTER_NAME}" + +echo "Image loaded." diff --git a/hack/orca/quickstart.md b/hack/orca/quickstart.md new file mode 100644 index 00000000..d3a7e38b --- /dev/null +++ b/hack/orca/quickstart.md @@ -0,0 +1,209 @@ + + +# Orca Dev Cluster Quickstart + +End-to-end recipe to stand up a local Kind cluster with Orca pointed +at an in-cluster Azurite origin and a LocalStack S3 cachestore, then +seed data and exercise the cache with debug-level traces. + +For the longer reference (every Make target, troubleshooting, +prerequisites, switching origin modes), see [dev-harness.md](./dev-harness.md). + +## Prerequisites + +- `kind`, `kubectl`, `podman` (or `docker`). +- `go` toolchain (used to build the orca image and run the + `hack/cmd/orcaseed` tool). + +## Step 1 - One-time setup + +Copy the example env file and edit it for Azurite-with-debug: + +```bash +cp hack/orca/.env.example hack/orca/.env +$EDITOR hack/orca/.env +``` + +Set: + +``` +ORIGIN_DRIVER=azureblob +ORIGIN_ID=azureblob-azurite +AZURE_CONTAINER=orca-test +LOG_LEVEL=debug +``` + +Leave `AZURE_STORAGE_ACCOUNT`, `AZURE_STORAGE_KEY`, and +`AZUREBLOB_ENDPOINT` blank - the harness auto-selects +`devstoreaccount1` + the well-known Azurite dev key + the in-cluster +Azurite Service URL. + +## Step 2 - Bring up the cluster + +```bash +make orca-up +``` + +Single command. Builds the orca image, creates the Kind cluster, +loads the image, deploys LocalStack + Azurite + Orca, waits until +all three Orca replicas are Ready. Orca pods start with +`logging.level: debug` so the per-chunk trace is live from the very +first request. + +Expected pods after bring-up: + +```bash +make -C hack/orca status +# azurite-... 1/1 Running +# localstack-... 1/1 Running +# orca-azurite-container-init-... 0/1 Completed +# orca-buckets-init-... 0/1 Completed +# orca-... 1/1 Running (x3) +``` + +## Step 3 - Seed the origin + +Azurite is exposed to the host via NodePort `30100` (Kind's +extraPortMapping forwards it to `localhost:30100`), so no +`kubectl port-forward` is needed for the seeder. + +```bash +# 5 x 10 MiB random blobs named synth-0 ... synth-4 +make -C hack/orca seed-generate SEED_ARGS='--size 10MiB --count 5' + +# Or a single 100 MiB blob named big-0 +make -C hack/orca seed-generate SEED_ARGS='--size 100MiB --count 1 --prefix big-' + +# Or upload a real file from disk +make -C hack/orca seed-upload FILE=~/data.tar.gz + +# Reproducible content (same --seed -> byte-identical blobs) +make -C hack/orca seed-generate SEED_ARGS='--size 10MiB --count 3 --seed 42' + +# Inspect / clean up +make -C hack/orca seed-list +make -C hack/orca seed-delete PREFIX=synth- SEED_ARGS='--yes' +``` + +Per-blob ceiling: 1 GiB unless `--force`. Cumulative-bytes warning at +1 GiB. The seeder uses chunked uploads, so very large blobs do not +buffer in host memory. + +## Step 4 - Port-forward the Orca edge + +In a separate terminal: + +```bash +make -C hack/orca port-forward +# Forwarding from 127.0.0.1:8443 -> 8443 +``` + +Leave this running. + +## Step 5 - Drive the cache + +```bash +# First hit: cold fill. Triggers origin GetRange, cachestore PutChunk. +curl -v http://localhost:8443/orca-test/synth-0 -o /dev/null + +# Second hit: warm cache. catalog hit -> cachestore_get_chunk. +curl -v http://localhost:8443/orca-test/synth-0 -o /dev/null +``` + +For the bigger blob, you can watch chunked streaming behaviour by +running the GET against `big-0` (12 chunks at the default 8 MiB +chunk size) and tailing the logs in parallel. + +## Step 6 - Watch the per-chunk debug trace + +```bash +# Filter to one bucket +make -C hack/orca logs | jq 'select(.chunk.bucket=="orca-test")' + +# Filter to one source file (e.g. just fetch coordinator decisions) +make -C hack/orca logs | jq 'select(.source.file | endswith("fetch.go"))' + +# Or just the firehose +make -C hack/orca logs +``` + +On a cold fill you should see a sequence like: + +``` +edge_request (server.EdgeHandler) +head_object (fetch.Coordinator) +metadata_singleflight_leader (metadata.Cache) +azureblob_head_request / _response (origin/azureblob) +metadata_record (metadata.Cache) +edge_get_plan (server.EdgeHandler) +get_chunk (fetch.Coordinator) +chunkcatalog_lookup_miss (chunkcatalog.Catalog) +cachestore_stat_result present:false (cachestore/s3) +coordinator_selected (cluster.Cluster) +fill_local_lead OR peer_fill_attempt (fetch.Coordinator) +origin_slot_acquired (fetch.Coordinator.runFill) +origin_get_range_attempt (fetch.fetchWithRetry) +azureblob_get_range_request / _response (origin/azureblob) +origin_body_received bytes=N (fetch.runFill) +cachestore_put_chunk -> _success (cachestore/s3) +commit_success (fetch.runFill) +chunkcatalog_record_insert (chunkcatalog.Catalog) +edge_get_complete (server.EdgeHandler) +``` + +On a warm hit only `chunkcatalog_lookup_hit` and +`cachestore_get_chunk` fire - no origin call, no commit. + +## Step 7 - Iterate + +```bash +# After editing Go source: +make orca-reset +# Rebuilds image, side-loads into Kind, rolling-restarts. ~30-60s. + +# After editing a manifest template or .env: +make -C hack/orca deploy # re-render + apply (idempotent) +make -C hack/orca reset # bounce to pick up new ConfigMap + +# Clear the cachestore between experiments (forces every chunk back +# to the cold-fill path on next GET): +kubectl --context kind-orca-dev -n unbounded-kube exec deploy/localstack -- \ + awslocal s3 rm s3://orca-cache --recursive + +# Clear the origin between experiments: +make -C hack/orca seed-delete SEED_ARGS='--yes' +``` + +## Step 8 - Tear down + +```bash +make orca-down +``` + +Deletes the Kind cluster (and everything in it). + +## Cheat-sheet of common helpers + +| Verb | Effect | +|---|---| +| `make orca-up` | Full bring-up (idempotent). | +| `make orca-reset` | Rebuild image + kind-load + rolling-restart Orca. | +| `make orca-down` | Delete the Kind cluster. | +| `make -C hack/orca status` | `kubectl get pods -o wide` in the namespace. | +| `make -C hack/orca logs` | Tail all Orca pods. | +| `make -C hack/orca port-forward` | localhost:8443 -> edge service. | +| `make -C hack/orca seed-generate SEED_ARGS='...'` | Synthetic content. | +| `make -C hack/orca seed-upload FILE=...` | Upload a real file. | +| `make -C hack/orca seed-list` | What's in the container. | +| `make -C hack/orca seed-delete [PREFIX=...]` | Remove blobs. | + +## Alternative: integration tests (no Kind cluster) + +If you don't need to inspect the K8s deployment shape, the Go-level +integration suite under `internal/orca/inttest/` covers chunked +fetch + dedup + peer fallback against testcontainers-managed +LocalStack + Azurite. Much faster, no Kind setup: + +```bash +make orca-inttest # ~15-20s, requires Docker +``` diff --git a/images/orca/Containerfile b/images/orca/Containerfile new file mode 100644 index 00000000..6a987546 --- /dev/null +++ b/images/orca/Containerfile @@ -0,0 +1,50 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# Build stage +FROM --platform=$BUILDPLATFORM docker.io/library/golang:1.26.2-trixie AS builder + +RUN apt-get update && apt-get install -y \ + build-essential \ + make \ + gcc \ + git \ + ca-certificates \ + && apt-get clean + +ENV CGO_ENABLED=0 +ENV GOPATH=/go +ENV GOTOOLCHAIN=auto +ENV PATH=$PATH:/go/bin + +WORKDIR /src + +COPY go.mod go.sum ./ +RUN go mod download + +COPY ../../ . + +ARG TARGETOS +ARG TARGETARCH +ARG VERSION=dev +ARG GIT_COMMIT= +ARG BUILD_TIME= +RUN GOOS=${TARGETOS} GOARCH=${TARGETARCH} \ + make orca-build VERSION=${VERSION} ${GIT_COMMIT:+GIT_COMMIT=${GIT_COMMIT}} ${BUILD_TIME:+BUILD_TIME=${BUILD_TIME}} + +# Runtime stage +FROM ubuntu:noble + +RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ + ca-certificates \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +RUN mkdir -p /unbounded/bin + +COPY --from=builder /src/bin/orca /unbounded/bin/orca + +ENV PATH="/unbounded/bin:${PATH}" + +WORKDIR /unbounded + +ENTRYPOINT ["/unbounded/bin/orca"] diff --git a/internal/orca/app/app.go b/internal/orca/app/app.go new file mode 100644 index 00000000..dcbdbdea --- /dev/null +++ b/internal/orca/app/app.go @@ -0,0 +1,572 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package app wires the Orca runtime: origin + cachestore + cluster + +// fetch coordinator + edge / internal HTTP listeners. +// +// Production callers (cmd/orca/orca/orca.go) drive this from a YAML +// config; integration tests (internal/orca/inttest) drive it from a +// programmatic *config.Config plus options that inject in-memory or +// counting decorators around the origin / cachestore. +package app + +import ( + "context" + "errors" + "fmt" + "log/slog" + "net" + "net/http" + "sync" + "time" + + "github.com/Azure/unbounded/internal/orca/cachestore" + cachestores3 "github.com/Azure/unbounded/internal/orca/cachestore/s3" + "github.com/Azure/unbounded/internal/orca/chunkcatalog" + "github.com/Azure/unbounded/internal/orca/cluster" + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/fetch" + "github.com/Azure/unbounded/internal/orca/metadata" + "github.com/Azure/unbounded/internal/orca/origin" + "github.com/Azure/unbounded/internal/orca/origin/awss3" + "github.com/Azure/unbounded/internal/orca/origin/azureblob" + "github.com/Azure/unbounded/internal/orca/server" +) + +// App is a running Orca instance. +// +// Construct with Start; tear down with Shutdown. Start is non-blocking: +// the returned App's listeners are accepting connections (via +// net.Listen) before Start returns, so EdgeAddr / InternalAddr / OpsAddr +// are resolved (including any :0 ports) by the time the caller sees them. +type App struct { + // EdgeAddr is the resolved client-edge listen address (host:port). + // When the config requested ":0" the port is the OS-assigned one. + EdgeAddr string + + // InternalAddr is the resolved peer-RPC listen address (host:port). + InternalAddr string + + // OpsAddr is the resolved /healthz + /readyz listen address. + OpsAddr string + + // Cluster is exposed so tests can inspect peer state and call + // Coordinator/Self for assertions. Production callers should treat + // this as read-only. + Cluster *cluster.Cluster + + log *slog.Logger + edgeSrv *http.Server + internalSrv *http.Server + opsSrv *http.Server + wg sync.WaitGroup + errCh chan error + + // cachestoreReady is set true once the cachestore self-test has + // passed (or skipped via WithSkipCachestoreSelfTest). Gated by + // the /readyz endpoint. + cachestoreReady bool +} + +type options struct { + log *slog.Logger + clusterOpt cluster.Option + origin origin.Origin + cacheStore cachestore.CacheStore + skipCacheSelfTest bool + internalHandlerWrap func(http.Handler) http.Handler + edgeListener net.Listener + internalListener net.Listener + opsListener net.Listener +} + +// Option configures Start. +type Option func(*options) + +// WithLogger overrides the slog.Logger used for the App's output. If +// not provided, a JSON handler writing to stdout at LevelInfo is used. +func WithLogger(log *slog.Logger) Option { + return func(o *options) { o.log = log } +} + +// WithPeerSource replaces the cluster's entire peer-discovery +// mechanism. Intended for integration tests that need full control +// (e.g. per-replica peer sets with explicit ports). Only one such +// override is meaningful per App; subsequent calls overwrite. +func WithPeerSource(s cluster.PeerSource) Option { + return func(o *options) { + o.clusterOpt = cluster.WithPeerSource(s) + } +} + +// WithOrigin replaces the origin driver constructed from cfg. Tests use +// this to wire counting / fault-injecting decorators around a real +// awss3 or azureblob client. +func WithOrigin(or origin.Origin) Option { + return func(o *options) { o.origin = or } +} + +// WithCacheStore replaces the cachestore driver constructed from cfg. +// Tests use this to wire a counting / fault-injecting decorator around +// a real s3 client (or to use an in-memory implementation). +func WithCacheStore(cs cachestore.CacheStore) Option { + return func(o *options) { o.cacheStore = cs } +} + +// WithSkipCachestoreSelfTest disables the boot-time atomic-commit +// self-test. Useful only in tests that wire a cachestore decorator +// already known to honor If-None-Match: *. +func WithSkipCachestoreSelfTest() Option { + return func(o *options) { o.skipCacheSelfTest = true } +} + +// WithInternalHandlerWrap installs a decorator around the internal +// peer-RPC handler. The wrap function receives the production handler +// and returns one that the http.Server actually serves. Production +// passes nothing -> identity. Tests use this to count 409 responses +// per source IP for the not-coordinator fallback assertion. +func WithInternalHandlerWrap(wrap func(http.Handler) http.Handler) Option { + return func(o *options) { o.internalHandlerWrap = wrap } +} + +// WithEdgeListener supplies a pre-bound listener for the client-edge +// HTTP server, bypassing app.Start's own net.Listen call. +// +// TEST-ONLY: production callers must not use this option. It is +// exposed for integration tests (internal/orca/inttest) that allocate +// the listener before the app starts so peer sets can advertise the +// captured port from t=0 without a close-and-rebind race. Using it in +// production silently disables the cfg.Server.Listen address. +func WithEdgeListener(ln net.Listener) Option { + return func(o *options) { o.edgeListener = ln } +} + +// WithInternalListener supplies a pre-bound listener for the peer-RPC +// internal HTTP server. +// +// TEST-ONLY: see WithEdgeListener. +func WithInternalListener(ln net.Listener) Option { + return func(o *options) { o.internalListener = ln } +} + +// WithOpsListener supplies a pre-bound listener for the ops HTTP +// server (/healthz, /readyz). +// +// TEST-ONLY: see WithEdgeListener. +func WithOpsListener(ln net.Listener) Option { + return func(o *options) { o.opsListener = ln } +} + +// Start wires every dependency and begins serving on the configured +// listeners. It returns once all listeners are accepting connections +// (or returns the error that prevented startup). +// +// The returned App must be Shutdown by the caller; Start does not own +// the parent context's lifetime. +// +// Ordering note: cluster.New is called before any listener is bound. +// Peers can therefore attempt internal-fill RPCs against this replica +// before its listener is accepting; those connects fail and the +// requester falls back to local fill via fetch.Coordinator.GetChunk's +// peer-fallback path. This is transient (sub-second between cluster +// construction and listener bind) and harmless. +func Start(ctx context.Context, cfg *config.Config, opts ...Option) (*App, error) { + o := options{} + for _, opt := range opts { + opt(&o) + } + + log := o.log + if log == nil { + log = slog.Default() + } + + or, err := buildOrigin(ctx, cfg, o.origin, log) + if err != nil { + return nil, err + } + + cs, err := buildCacheStore(ctx, cfg, o.cacheStore, log) + if err != nil { + return nil, err + } + + cachestoreReady := false + + if o.skipCacheSelfTest { + // Caller has asserted the cachestore decorator honors + // If-None-Match: * (the in-memory store used by tests). + // Treat readiness as satisfied immediately. + cachestoreReady = true + } else { + if err := cs.SelfTestAtomicCommit(ctx); err != nil { + return nil, fmt.Errorf("cachestore self-test failed: %w", err) + } + + log.LogAttrs(ctx, slog.LevelInfo, "cachestore self-test passed") + + cachestoreReady = true + } + + clusterOpts := []cluster.Option{cluster.WithLogger(log)} + if o.clusterOpt != nil { + clusterOpts = append(clusterOpts, o.clusterOpt) + } + + cl, err := cluster.New(ctx, cfg.Cluster, clusterOpts...) + if err != nil { + return nil, fmt.Errorf("init cluster: %w", err) + } + + cat := chunkcatalog.New(cfg.ChunkCatalog.MaxEntries, log) + mc := metadata.NewCache(cfg.Metadata, log) + fc := fetch.NewCoordinator(or, cs, cl, cat, mc, cfg, log) + + edgeHandler := server.NewEdgeHandler(fc, cfg, log) + + var internalHandler http.Handler = server.NewInternalHandler(fc, cl, log) + if o.internalHandlerWrap != nil { + internalHandler = o.internalHandlerWrap(internalHandler) + } + + edgeLn := o.edgeListener + if edgeLn == nil { + ln, err := net.Listen("tcp", cfg.Server.Listen) + if err != nil { + cleanupStartFailure(cl, nil, nil) + + return nil, fmt.Errorf("edge listener bind %q: %w", cfg.Server.Listen, err) + } + + edgeLn = ln + } + + internalLn := o.internalListener + if internalLn == nil { + ln, err := net.Listen("tcp", cfg.Cluster.InternalListen) + if err != nil { + cleanupStartFailure(cl, edgeLn, nil) + + return nil, fmt.Errorf("internal listener bind %q: %w", cfg.Cluster.InternalListen, err) + } + + internalLn = ln + } + + opsLn := o.opsListener + if opsLn == nil { + ln, err := net.Listen("tcp", cfg.Server.OpsListen) + if err != nil { + cleanupStartFailure(cl, edgeLn, internalLn) + + return nil, fmt.Errorf("ops listener bind %q: %w", cfg.Server.OpsListen, err) + } + + opsLn = ln + } + + a := &App{ + EdgeAddr: edgeLn.Addr().String(), + InternalAddr: internalLn.Addr().String(), + OpsAddr: opsLn.Addr().String(), + Cluster: cl, + log: log, + edgeSrv: &http.Server{ + Handler: edgeHandler, + ReadHeaderTimeout: 10 * time.Second, + }, + internalSrv: &http.Server{ + Handler: internalHandler, + ReadHeaderTimeout: 10 * time.Second, + }, + errCh: make(chan error, 3), + cachestoreReady: cachestoreReady, + } + + a.opsSrv = &http.Server{ + Handler: newOpsHandler(a.isReady), + ReadHeaderTimeout: 5 * time.Second, + } + + a.wg.Add(1) + + go func() { + defer a.wg.Done() + + log.LogAttrs(ctx, slog.LevelInfo, "edge listener", + slog.String("addr", a.EdgeAddr), + ) + + if err := a.edgeSrv.Serve(edgeLn); err != nil && !errors.Is(err, http.ErrServerClosed) { + a.errCh <- fmt.Errorf("edge listener: %w", err) + } + }() + + a.wg.Add(1) + + go func() { + defer a.wg.Done() + + log.LogAttrs(ctx, slog.LevelInfo, "internal listener", + slog.String("addr", a.InternalAddr), + slog.Bool("tls_enabled", cfg.Cluster.InternalTLS.Enabled), + ) + + var lerr error + if cfg.Cluster.InternalTLS.Enabled { + lerr = a.internalSrv.ServeTLS(internalLn, + cfg.Cluster.InternalTLS.CertFile, + cfg.Cluster.InternalTLS.KeyFile, + ) + } else { + log.LogAttrs(ctx, slog.LevelWarn, "internal listener TLS DISABLED - unsafe for production", + slog.String("addr", a.InternalAddr), + ) + + lerr = a.internalSrv.Serve(internalLn) + } + + if lerr != nil && !errors.Is(lerr, http.ErrServerClosed) { + a.errCh <- fmt.Errorf("internal listener: %w", lerr) + } + }() + + a.wg.Add(1) + + go func() { + defer a.wg.Done() + + log.LogAttrs(ctx, slog.LevelInfo, "ops listener", + slog.String("addr", a.OpsAddr), + ) + + if err := a.opsSrv.Serve(opsLn); err != nil && !errors.Is(err, http.ErrServerClosed) { + a.errCh <- fmt.Errorf("ops listener: %w", err) + } + }() + + return a, nil +} + +// cleanupStartFailure unwinds partially-constructed Start state when +// a subsequent step (e.g. a later net.Listen) fails. Closes any +// listeners already bound and tells the cluster to stop its refresh +// goroutine within a bounded budget. +func cleanupStartFailure(cl *cluster.Cluster, listeners ...net.Listener) { + for _, ln := range listeners { + if ln == nil { + continue + } + + _ = ln.Close() //nolint:errcheck // best-effort close on bind failure + } + + closeCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + _ = cl.Close(closeCtx) //nolint:errcheck // best-effort cleanup on bind failure +} + +// newOpsHandler returns the http.Handler serving /healthz and +// /readyz for kubelet probes. /healthz is unconditional 200 +// (process-alive); /readyz returns 200 only when isReady reports +// true. isReady is injected so tests can drive the readiness +// signal independently of the surrounding App. +func newOpsHandler(isReady func() bool) http.Handler { + mux := http.NewServeMux() + mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("ok")) //nolint:errcheck // best-effort probe response + }) + mux.HandleFunc("/readyz", func(w http.ResponseWriter, _ *http.Request) { + if !isReady() { + w.WriteHeader(http.StatusServiceUnavailable) + _, _ = w.Write([]byte("not ready")) //nolint:errcheck // best-effort probe response + + return + } + + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("ready")) //nolint:errcheck // best-effort probe response + }) + + return mux +} + +// isReady reports whether the app is ready to serve traffic. +// Both conditions must hold: +// - cachestore self-test passed (or skipped via the test option). +// - cluster has loaded an initial peer-set snapshot. +func (a *App) isReady() bool { + return a.cachestoreReady && a.Cluster.HasInitialSnapshot() +} + +// Wait blocks until either the parent context is canceled or one of +// the listeners exits unexpectedly. It returns the first listener +// error (if any) or nil if ctx was canceled. Wait is intended for +// the production "serve until SIGTERM" path; tests typically call +// Shutdown directly. +// +// Any listener errors that arrive concurrently with the wait-return +// (ctx-cancel branch or first-error branch) are drained and logged +// at Warn so they aren't silently discarded. Without this, a +// shutdown that overlaps with a listener failure - or a multi- +// listener crash where two listeners errored within the same tick - +// would lose all but the first error. +// +// Priority: when ctx is already canceled at the time Wait is called, +// the ctx-cancel branch is taken deterministically even if errCh +// also has buffered errors. Go's select non-determinism would +// otherwise flip the return value between nil and a buffered error +// on a tick race, contradicting the documented "nil if ctx was +// canceled" contract. The buffered errors are still logged via +// drainErrCh; only their effect on Wait's return value is +// suppressed in this specific overlap. +func (a *App) Wait(ctx context.Context) error { + // Non-blocking pre-check: if ctx is already canceled, take the + // shutdown branch without exposing the select-randomization + // race against any errors that may have arrived alongside the + // cancellation. See the function comment for rationale. + select { + case <-ctx.Done(): + a.drainErrCh(ctx, "listener error received during shutdown") + + return nil + default: + } + + select { + case <-ctx.Done(): + a.drainErrCh(ctx, "listener error received during shutdown") + + return nil + case err := <-a.errCh: + a.drainErrCh(ctx, "additional listener error after first") + + return err + } +} + +// drainErrCh non-blockingly consumes any remaining errors from +// a.errCh and logs them at Warn with the given message. Used by +// Wait on both return paths to ensure no listener error is silently +// dropped. +func (a *App) drainErrCh(ctx context.Context, msg string) { + for { + select { + case err := <-a.errCh: + a.log.LogAttrs(ctx, slog.LevelWarn, msg, + slog.Any("err", err), + ) + default: + return + } + } +} + +// Shutdown gracefully stops every listener and the cluster goroutine. +// It is safe to call multiple times; subsequent calls are no-ops. +func (a *App) Shutdown(ctx context.Context) error { + var firstErr error + + if err := a.edgeSrv.Shutdown(ctx); err != nil { + a.log.LogAttrs(ctx, slog.LevelWarn, "edge listener shutdown failed", + slog.Any("err", err), + ) + + firstErr = err + } + + if err := a.internalSrv.Shutdown(ctx); err != nil { + a.log.LogAttrs(ctx, slog.LevelWarn, "internal listener shutdown failed", + slog.Any("err", err), + ) + + if firstErr == nil { + firstErr = err + } + } + + if a.opsSrv != nil { + if err := a.opsSrv.Shutdown(ctx); err != nil { + a.log.LogAttrs(ctx, slog.LevelWarn, "ops listener shutdown failed", + slog.Any("err", err), + ) + + if firstErr == nil { + firstErr = err + } + } + } + + if err := a.Cluster.Close(ctx); err != nil { + a.log.LogAttrs(ctx, slog.LevelWarn, "cluster close did not finish before ctx deadline", + slog.Any("err", err), + ) + + if firstErr == nil { + firstErr = err + } + } + + a.wg.Wait() + + return firstErr +} + +func buildOrigin(ctx context.Context, cfg *config.Config, override origin.Origin, log *slog.Logger) (origin.Origin, error) { + if override != nil { + return override, nil + } + + switch cfg.Origin.Driver { + case "azureblob": + or, err := azureblob.New(cfg.Origin.Azureblob, log) + if err != nil { + return nil, fmt.Errorf("init origin/azureblob: %w", err) + } + + return or, nil + case "awss3": + or, err := awss3.New(ctx, awss3.Config{ + Endpoint: cfg.Origin.AWSS3.Endpoint, + Region: cfg.Origin.AWSS3.Region, + Bucket: cfg.Origin.AWSS3.Bucket, + AccessKey: cfg.Origin.AWSS3.AccessKey, + SecretKey: cfg.Origin.AWSS3.SecretKey, + UsePathStyle: cfg.Origin.AWSS3.UsePathStyle, + }, log) + if err != nil { + return nil, fmt.Errorf("init origin/awss3: %w", err) + } + + return or, nil + default: + return nil, fmt.Errorf("unsupported origin driver: %q", cfg.Origin.Driver) + } +} + +func buildCacheStore(ctx context.Context, cfg *config.Config, override cachestore.CacheStore, log *slog.Logger) (cachestore.CacheStore, error) { + if override != nil { + return override, nil + } + + switch cfg.Cachestore.Driver { + case "s3": + cs, err := cachestores3.New(ctx, cachestores3.Config{ + Endpoint: cfg.Cachestore.S3.Endpoint, + Bucket: cfg.Cachestore.S3.Bucket, + Region: cfg.Cachestore.S3.Region, + AccessKey: cfg.Cachestore.S3.AccessKey, + SecretKey: cfg.Cachestore.S3.SecretKey, + UsePathStyle: cfg.Cachestore.S3.UsePathStyle, + }, log) + if err != nil { + return nil, fmt.Errorf("init cachestore/s3: %w", err) + } + + return cs, nil + default: + return nil, fmt.Errorf("unsupported cachestore driver: %q", cfg.Cachestore.Driver) + } +} diff --git a/internal/orca/app/app_test.go b/internal/orca/app/app_test.go new file mode 100644 index 00000000..37cf9ff6 --- /dev/null +++ b/internal/orca/app/app_test.go @@ -0,0 +1,146 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package app + +import ( + "context" + "errors" + "log/slog" + "net/http" + "net/http/httptest" + "strings" + "sync/atomic" + "testing" +) + +// TestOpsHandler_Healthz_AlwaysReturnsOK locks the contract that +// /healthz is process-liveness only: it returns 200 unconditionally, +// without consulting any readiness signal. Kubelet liveness probes +// must succeed even before the app has fully bootstrapped. +func TestOpsHandler_Healthz_AlwaysReturnsOK(t *testing.T) { + t.Parallel() + + // readyFn is set to always-false; healthz must still 200. + h := newOpsHandler(func() bool { return false }) + + req := httptest.NewRequest(http.MethodGet, "/healthz", nil) + rr := httptest.NewRecorder() + h.ServeHTTP(rr, req) + + if rr.Code != http.StatusOK { + t.Errorf("healthz status = %d, want %d", rr.Code, http.StatusOK) + } +} + +// TestOpsHandler_Readyz_NotReadyReturns503 verifies that /readyz +// surfaces 503 Service Unavailable while the readiness signal is +// false. Kubelet readiness probes use 503 to gate Service endpoint +// inclusion so traffic does not arrive until the app is ready. +func TestOpsHandler_Readyz_NotReadyReturns503(t *testing.T) { + t.Parallel() + + h := newOpsHandler(func() bool { return false }) + + req := httptest.NewRequest(http.MethodGet, "/readyz", nil) + rr := httptest.NewRecorder() + h.ServeHTTP(rr, req) + + if rr.Code != http.StatusServiceUnavailable { + t.Errorf("readyz status = %d, want %d", rr.Code, http.StatusServiceUnavailable) + } +} + +// TestOpsHandler_Readyz_ReadyReturns200 verifies the readiness +// transition from 503 to 200 when the injected signal flips. This +// is the bootstrap path the app drives once the cachestore +// self-test has passed and the cluster has loaded its initial +// peer-set snapshot. +func TestOpsHandler_Readyz_ReadyReturns200(t *testing.T) { + t.Parallel() + + var ready atomic.Bool + + h := newOpsHandler(ready.Load) + + // Initial: not ready. + req := httptest.NewRequest(http.MethodGet, "/readyz", nil) + rr := httptest.NewRecorder() + h.ServeHTTP(rr, req) + + if rr.Code != http.StatusServiceUnavailable { + t.Fatalf("pre-ready readyz = %d, want %d", rr.Code, http.StatusServiceUnavailable) + } + // Flip readiness and re-probe. + ready.Store(true) + + req = httptest.NewRequest(http.MethodGet, "/readyz", nil) + rr = httptest.NewRecorder() + h.ServeHTTP(rr, req) + + if rr.Code != http.StatusOK { + t.Errorf("post-ready readyz = %d, want %d", rr.Code, http.StatusOK) + } +} + +// TestApp_IsReady_RequiresCachestoreReady locks the AND-gating +// behaviour of isReady. When cachestoreReady is false, isReady must +// short-circuit and return false without touching the Cluster +// pointer. Without that short-circuit a self-test failure that +// leaves Cluster nil would panic the /readyz handler. +func TestApp_IsReady_RequiresCachestoreReady(t *testing.T) { + t.Parallel() + + a := &App{cachestoreReady: false} + + defer func() { + if r := recover(); r != nil { + t.Fatalf("isReady panicked instead of short-circuiting on cachestoreReady=false: %v", r) + } + }() + + if a.isReady() { + t.Errorf("isReady = true with cachestoreReady=false") + } +} + +// TestApp_Wait_DrainsErrChOnCtxCancel verifies that listener errors +// arriving alongside a shutdown ctx are all logged rather than only +// the first being preserved. Pre-fills errCh with three errors, +// then cancels ctx; Wait should drain all three to the logger. +// +// Regression for M-4 / the earlier app.Wait drain work; the +// expanded drain helper now applies to both Wait return paths so a +// multi-listener crash within a tick doesn't lose errors. +func TestApp_Wait_DrainsErrChOnCtxCancel(t *testing.T) { + t.Parallel() + + var buf strings.Builder + + log := slog.New(slog.NewTextHandler(&buf, &slog.HandlerOptions{Level: slog.LevelWarn})) + + a := &App{ + log: log, + errCh: make(chan error, 4), + } + + a.errCh <- errors.New("edge boom") + + a.errCh <- errors.New("internal boom") + + a.errCh <- errors.New("ops boom") + + ctx, cancel := context.WithCancel(context.Background()) + cancel() // ctx already cancelled when Wait starts + + if err := a.Wait(ctx); err != nil { + t.Errorf("Wait err = %v, want nil (ctx cancelled)", err) + } + + out := buf.String() + for _, want := range []string{"edge boom", "internal boom", "ops boom"} { + if !strings.Contains(out, want) { + t.Errorf("drained log missing %q; got %q", want, out) + } + } +} diff --git a/internal/orca/cachestore/cachestore.go b/internal/orca/cachestore/cachestore.go new file mode 100644 index 00000000..9b99f5df --- /dev/null +++ b/internal/orca/cachestore/cachestore.go @@ -0,0 +1,45 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package cachestore defines the in-DC chunk store interface and shared +// types. Concrete drivers live under cachestore//. +// +// All drivers must implement atomic commit (CAS-style PutChunk that +// rejects overwrites) so concurrent fills across replicas converge +// without clobbering each other; SelfTestAtomicCommit is run at boot +// to verify the backend honors the precondition. +package cachestore + +import ( + "context" + "errors" + "io" + "time" + + "github.com/Azure/unbounded/internal/orca/chunk" +) + +// CacheStore is where chunk bytes physically live. Source of truth for +// chunk presence; backed by an in-DC S3-like store in production and +// LocalStack in dev. +type CacheStore interface { + GetChunk(ctx context.Context, k chunk.Key, off, n int64) (io.ReadCloser, error) + PutChunk(ctx context.Context, k chunk.Key, size int64, r io.Reader) error + Stat(ctx context.Context, k chunk.Key) (Info, error) + Delete(ctx context.Context, k chunk.Key) error + SelfTestAtomicCommit(ctx context.Context) error +} + +// Info is the result of a successful Stat. +type Info struct { + Size int64 + Committed time.Time +} + +// Sentinel errors. Wrap with %w so callers use errors.Is. +var ( + ErrNotFound = errors.New("cachestore: not found") + ErrTransient = errors.New("cachestore: transient") + ErrAuth = errors.New("cachestore: auth") + ErrCommitLost = errors.New("cachestore: commit lost (no-clobber denied)") +) diff --git a/internal/orca/cachestore/s3/s3.go b/internal/orca/cachestore/s3/s3.go new file mode 100644 index 00000000..50e46668 --- /dev/null +++ b/internal/orca/cachestore/s3/s3.go @@ -0,0 +1,528 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package s3 is the cachestore driver for in-DC S3-compatible stores. +// In production this targets VAST or another S3-compatible object +// store; in dev it targets LocalStack. +// +// Atomic commit is implemented via PutObject + If-None-Match: * (s3 +// conditional writes). The boot SelfTestAtomicCommit verifies the +// backend honors the precondition; the boot versioning gate verifies +// the bucket is not versioned (since If-None-Match is not honored on +// versioned buckets). +package s3 + +import ( + "bytes" + "context" + "crypto/rand" + "encoding/hex" + "errors" + "fmt" + "io" + "log/slog" + "net/http" + + "github.com/aws/aws-sdk-go-v2/aws" + awshttp "github.com/aws/aws-sdk-go-v2/aws/transport/http" + awsconfig "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/credentials" + "github.com/aws/aws-sdk-go-v2/service/s3" + s3types "github.com/aws/aws-sdk-go-v2/service/s3/types" + "github.com/aws/smithy-go" + + "github.com/Azure/unbounded/internal/orca/cachestore" + "github.com/Azure/unbounded/internal/orca/chunk" +) + +// Driver implements cachestore.CacheStore against an S3-compatible +// endpoint. +type Driver struct { + client *s3.Client + bucket string + log *slog.Logger +} + +// Config is the s3-driver configuration. Mirrors config.CachestoreS3 +// but kept package-local so the driver can be unit-tested without +// importing the whole config package. +type Config struct { + Endpoint string + Bucket string + Region string + AccessKey string + SecretKey string + UsePathStyle bool +} + +// New constructs a Driver. The bucket-versioning gate is run here +// unconditionally: a versioned bucket silently breaks the no-clobber +// atomic-commit primitive (PutObject + If-None-Match: *) so the +// driver refuses to start against one. +// +// The log receives debug-level emissions for every chunk operation +// (Get, Put, Stat, Delete) and step-by-step boot trace from +// SelfTestAtomicCommit / versioningGate. Passing nil falls back to +// slog.Default(). +// +// SelfTestAtomicCommit is a separate step (called by main after New) +// to keep the constructor side-effect-light. +func New(ctx context.Context, cfg Config, log *slog.Logger) (*Driver, error) { + if cfg.Bucket == "" { + return nil, fmt.Errorf("cachestore/s3: bucket required") + } + + if cfg.Endpoint == "" { + return nil, fmt.Errorf("cachestore/s3: endpoint required") + } + + awsCfg, err := awsconfig.LoadDefaultConfig(ctx, + awsconfig.WithRegion(cfg.Region), + awsconfig.WithCredentialsProvider(credentials.NewStaticCredentialsProvider( + cfg.AccessKey, cfg.SecretKey, "", + )), + // Opt out of CRC64NVME default introduced in aws-sdk-go-v2 + // 1.32. LocalStack 3.8 returns InvalidRequest for unknown + // algorithms; real AWS S3 still works either way. + awsconfig.WithRequestChecksumCalculation(aws.RequestChecksumCalculationWhenRequired), + awsconfig.WithResponseChecksumValidation(aws.ResponseChecksumValidationWhenRequired), + ) + if err != nil { + return nil, fmt.Errorf("cachestore/s3: aws config: %w", err) + } + + client := s3.NewFromConfig(awsCfg, func(o *s3.Options) { + o.BaseEndpoint = aws.String(cfg.Endpoint) + o.UsePathStyle = cfg.UsePathStyle + }) + + if log == nil { + log = slog.Default() + } + + d := &Driver{ + client: client, + bucket: cfg.Bucket, + log: log, + } + + if err := d.versioningGate(ctx); err != nil { + return nil, err + } + + return d, nil +} + +// versioningGate refuses to start if the bucket has versioning enabled +// or suspended. If-None-Match: * is not honored against versioned +// buckets, which would silently break atomic commit's no-clobber +// guarantee. +func (d *Driver) versioningGate(ctx context.Context) error { + d.log.LogAttrs(ctx, slog.LevelDebug, "versioning_gate_probe", + slog.String("bucket", d.bucket), + ) + + out, err := d.client.GetBucketVersioning(ctx, &s3.GetBucketVersioningInput{ + Bucket: aws.String(d.bucket), + }) + if err != nil { + return fmt.Errorf("cachestore/s3: GetBucketVersioning failed: %w", err) + } + + d.log.LogAttrs(ctx, slog.LevelDebug, "versioning_gate_status", + slog.String("bucket", d.bucket), + slog.String("status", string(out.Status)), + ) + + return validateBucketVersioning(d.bucket, out.Status) +} + +// validateBucketVersioning returns an error if the bucket's versioning +// status is incompatible with cachestore/s3's atomic-commit primitive. +// Extracted as a pure function so unit tests can cover all branches +// (empty / Enabled / Suspended) without round-tripping to a real or +// emulated S3 backend. +func validateBucketVersioning(bucket string, status s3types.BucketVersioningStatus) error { + switch status { + case s3types.BucketVersioningStatusEnabled, s3types.BucketVersioningStatusSuspended: + return fmt.Errorf( + "cachestore/s3: bucket %s has versioning %s; If-None-Match: * is not "+ + "honored on versioned buckets and the atomic-commit primitive cannot "+ + "guarantee no-clobber; disable bucket versioning to use cachestore/s3", + bucket, status) + } + + return nil +} + +// SelfTestAtomicCommit verifies the backend honors PutObject + +// If-None-Match: *. +func (d *Driver) SelfTestAtomicCommit(ctx context.Context) error { + suffix, err := randHex(16) + if err != nil { + return fmt.Errorf("cachestore/s3 self-test: generate probe key: %w", err) + } + + probeKey := fmt.Sprintf("_orca-selftest/%s", suffix) + body := []byte("orca-selftest") + + d.log.LogAttrs(ctx, slog.LevelDebug, "selftest_first_put", + slog.String("bucket", d.bucket), + slog.String("probe_key", probeKey), + ) + + // First put: must succeed. + _, err = d.client.PutObject(ctx, &s3.PutObjectInput{ + Bucket: aws.String(d.bucket), + Key: aws.String(probeKey), + Body: bytes.NewReader(body), + IfNoneMatch: aws.String("*"), + }) + if err != nil { + return fmt.Errorf("cachestore/s3 self-test: first put failed: %w", err) + } + + d.log.LogAttrs(ctx, slog.LevelDebug, "selftest_second_put_expecting_412", + slog.String("bucket", d.bucket), + slog.String("probe_key", probeKey), + ) + + // Second put: must fail with 412. + _, err = d.client.PutObject(ctx, &s3.PutObjectInput{ + Bucket: aws.String(d.bucket), + Key: aws.String(probeKey), + Body: bytes.NewReader(body), + IfNoneMatch: aws.String("*"), + }) + if err == nil { + // Clean up before returning the failure. + _, _ = d.client.DeleteObject(ctx, &s3.DeleteObjectInput{ //nolint:errcheck // best-effort selftest cleanup + Bucket: aws.String(d.bucket), + Key: aws.String(probeKey), + }) + + return fmt.Errorf( + "cachestore/s3: backend does not honor If-None-Match: *; refusing to start " + + "(second concurrent put returned 200 instead of 412)") + } + + if !isPreconditionFailed(err) { + _, _ = d.client.DeleteObject(ctx, &s3.DeleteObjectInput{ //nolint:errcheck // best-effort selftest cleanup + Bucket: aws.String(d.bucket), + Key: aws.String(probeKey), + }) + + return fmt.Errorf("cachestore/s3 self-test: second put returned unexpected error "+ + "(want 412 PreconditionFailed): %w", err) + } + + d.log.LogAttrs(ctx, slog.LevelDebug, "selftest_second_put_rejected_412", + slog.String("bucket", d.bucket), + slog.String("probe_key", probeKey), + ) + + // Cleanup probe key. + _, _ = d.client.DeleteObject(ctx, &s3.DeleteObjectInput{ //nolint:errcheck // best-effort selftest cleanup + Bucket: aws.String(d.bucket), + Key: aws.String(probeKey), + }) + + return nil +} + +// GetChunk fetches [off, off+n) of the chunk path from the bucket. +// +// Rejects n <= 0 with a sentinel ErrInvalidArgument: the wire-format +// boundary (cluster.DecodeChunkKey) already rejects object_size <= 0, +// so an in-process caller asking for a zero-length read is a logic +// bug. Forwarding the request would yield a malformed S3 Range +// header (bytes=0--1). +func (d *Driver) GetChunk(ctx context.Context, k chunk.Key, off, n int64) (io.ReadCloser, error) { + if n <= 0 { + return nil, fmt.Errorf("cachestore/s3 get: n must be > 0, got %d", n) + } + + if off < 0 { + return nil, fmt.Errorf("cachestore/s3 get: off must be >= 0, got %d", off) + } + + rng := fmt.Sprintf("bytes=%d-%d", off, off+n-1) + + d.log.LogAttrs(ctx, slog.LevelDebug, "cachestore_get_chunk", + csChunkAttrs(k), + slog.Int64("off", off), + slog.Int64("n", n), + ) + + out, err := d.client.GetObject(ctx, &s3.GetObjectInput{ + Bucket: aws.String(d.bucket), + Key: aws.String(k.Path()), + Range: aws.String(rng), + }) + if err != nil { + mapped := mapErr(err) + d.log.LogAttrs(ctx, slog.LevelDebug, "cachestore_get_chunk_err", + csChunkAttrs(k), + slog.Any("err", mapped), + ) + + return nil, mapped + } + + return out.Body, nil +} + +// PutChunk uploads the chunk via PutObject + If-None-Match: *. On +// 412 returns ErrCommitLost (loser of an atomic-commit race). +// +// Rejects size <= 0 with a sentinel error: a zero-byte chunk is +// never a legitimate fill result (the wire-format boundary already +// rejects object_size <= 0, and the smallest legitimate tail chunk +// is 1 byte), and uploading a zero-byte object would poison the +// path so later GetChunk(n=expected) reads return 0 bytes and break +// the streaming model. +func (d *Driver) PutChunk(ctx context.Context, k chunk.Key, size int64, r io.Reader) error { + if size <= 0 { + return fmt.Errorf("cachestore/s3 put: size must be > 0, got %d", size) + } + // AWS SDK v2 needs an io.ReadSeeker for unsigned-payload uploads + // (so it can rewind on signed-retry). If the caller already passed + // a seekable reader we hand it to the SDK directly; otherwise + // buffer the bytes ourselves as a fallback. + body, ok := r.(io.ReadSeeker) + if !ok { + buf, err := io.ReadAll(r) + if err != nil { + return fmt.Errorf("cachestore/s3 put: read body: %w", err) + } + // Validate the actual byte count against the caller's + // claimed size. + if int64(len(buf)) != size { + return fmt.Errorf("cachestore/s3 put: short body (got %d want %d)", len(buf), size) + } + + body = bytes.NewReader(buf) + } else { + // Seekable-path size validation: probe the reader's length + // via Seek(0, End), confirm it matches the declared size, + // then rewind to position 0 for the upload. Without this + // guard, a buggy caller passing a Reader of length M with + // size=N would either be rejected by S3 (ContentLength + // mismatch) or upload a truncated / overlong blob, + // depending on backend behaviour. The wire-format boundary + // already rejects size <= 0; this catches the size > 0 but + // mismatched-bytes case at the driver entry point. + end, err := body.Seek(0, io.SeekEnd) + if err != nil { + return fmt.Errorf("cachestore/s3 put: seek-end: %w", err) + } + + if end != size { + return fmt.Errorf("cachestore/s3 put: seekable reader length %d does not match size %d", end, size) + } + + if _, err := body.Seek(0, io.SeekStart); err != nil { + return fmt.Errorf("cachestore/s3 put: seek-rewind: %w", err) + } + } + + d.log.LogAttrs(ctx, slog.LevelDebug, "cachestore_put_chunk", + csChunkAttrs(k), + slog.Int64("size", size), + ) + + _, err := d.client.PutObject(ctx, &s3.PutObjectInput{ + Bucket: aws.String(d.bucket), + Key: aws.String(k.Path()), + Body: body, + ContentLength: aws.Int64(size), + IfNoneMatch: aws.String("*"), + }) + if err != nil { + if isPreconditionFailed(err) { + d.log.LogAttrs(ctx, slog.LevelDebug, "cachestore_put_commit_lost", + csChunkAttrs(k), + ) + + return cachestore.ErrCommitLost + } + + mapped := mapErr(err) + d.log.LogAttrs(ctx, slog.LevelDebug, "cachestore_put_err", + csChunkAttrs(k), + slog.Any("err", mapped), + ) + + return mapped + } + + d.log.LogAttrs(ctx, slog.LevelDebug, "cachestore_put_success", + csChunkAttrs(k), + slog.Int64("size", size), + ) + + return nil +} + +// Stat checks for chunk presence. +func (d *Driver) Stat(ctx context.Context, k chunk.Key) (cachestore.Info, error) { + out, err := d.client.HeadObject(ctx, &s3.HeadObjectInput{ + Bucket: aws.String(d.bucket), + Key: aws.String(k.Path()), + }) + if err != nil { + mapped := mapErr(err) + // ErrNotFound is the expected 'miss' result for Stat; logged + // at the same debug level as the hit path so cache-hit-rate + // diagnostics can count both. + d.log.LogAttrs(ctx, slog.LevelDebug, "cachestore_stat_result", + csChunkAttrs(k), + slog.Bool("present", false), + slog.Any("err", mapped), + ) + + return cachestore.Info{}, mapped + } + + info := cachestore.Info{} + if out.ContentLength != nil { + info.Size = *out.ContentLength + } + + if out.LastModified != nil { + info.Committed = *out.LastModified + } + + d.log.LogAttrs(ctx, slog.LevelDebug, "cachestore_stat_result", + csChunkAttrs(k), + slog.Bool("present", true), + slog.Int64("size", info.Size), + ) + + return info, nil +} + +// Delete removes the chunk; idempotent. +func (d *Driver) Delete(ctx context.Context, k chunk.Key) error { + d.log.LogAttrs(ctx, slog.LevelDebug, "cachestore_delete", + csChunkAttrs(k), + ) + + _, err := d.client.DeleteObject(ctx, &s3.DeleteObjectInput{ + Bucket: aws.String(d.bucket), + Key: aws.String(k.Path()), + }) + if err != nil { + if isNotFound(err) { + return nil + } + + return mapErr(err) + } + + return nil +} + +// csChunkAttrs renders the chunk's identifying tuple as a slog +// group attribute matching the cross-package 'chunk' taxonomy used +// by fetch.Coordinator and chunkcatalog. Operator queries can grep +// on a single attribute path across the request lifecycle. +func csChunkAttrs(k chunk.Key) slog.Attr { + return slog.Group("chunk", + slog.String("origin_id", k.OriginID), + slog.String("bucket", k.Bucket), + slog.String("key", k.ObjectKey), + slog.Int64("index", k.Index), + ) +} + +func randHex(n int) (string, error) { + b := make([]byte, n) + if _, err := rand.Read(b); err != nil { + // crypto/rand failure is extraordinary on Linux. Surface it + // to the selftest caller rather than masking with a + // time-based fallback: a fallback could collide on parallel + // boots and silently fail the first-put precondition, and + // the underlying entropy / sandbox issue is operator- + // actionable in its own right. + return "", fmt.Errorf("cachestore/s3: rand.Read: %w", err) + } + + return hex.EncodeToString(b), nil +} + +// isPreconditionFailed reports whether err represents a 412 +// Precondition Failed response from S3. The atomic-commit primitive +// (PutObject + If-None-Match: *) returns 412 when the key already +// exists; the SelfTest path also expects 412 on the duplicate put. +// We use the HTTP status code carried on *awshttp.ResponseError +// rather than matching service error codes by string, since the +// code surface is version-dependent across SDK and backend +// implementations whereas the HTTP status code is part of the +// stable wire contract. +func isPreconditionFailed(err error) bool { + var respErr *awshttp.ResponseError + if errors.As(err, &respErr) && respErr.Response != nil { + return respErr.Response.StatusCode == http.StatusPreconditionFailed + } + + return false +} + +func isNotFound(err error) bool { + var nsk *s3types.NoSuchKey + if errors.As(err, &nsk) { + return true + } + + var nsb *s3types.NoSuchBucket + if errors.As(err, &nsb) { + return true + } + + var notFound *s3types.NotFound + if errors.As(err, ¬Found) { + return true + } + + var respErr *awshttp.ResponseError + if errors.As(err, &respErr) && respErr.Response != nil && + respErr.Response.StatusCode == http.StatusNotFound { + return true + } + + return false +} + +// mapErr normalises driver errors to the cachestore sentinel +// taxonomy. AccessDenied / Forbidden / Unauthorized are surfaced by +// the SDK with stable smithy.APIError codes so we keep that match +// path; everything else routes through HTTP status code on the +// underlying *awshttp.ResponseError. +func mapErr(err error) error { + if isNotFound(err) { + return cachestore.ErrNotFound + } + + var apiErr smithy.APIError + if errors.As(err, &apiErr) { + switch apiErr.ErrorCode() { + case "AccessDenied", "Unauthorized", "Forbidden", "InvalidAccessKeyId", "SignatureDoesNotMatch": + return cachestore.ErrAuth + } + } + + var respErr *awshttp.ResponseError + if errors.As(err, &respErr) && respErr.Response != nil { + status := respErr.Response.StatusCode + if status == http.StatusUnauthorized || status == http.StatusForbidden { + return cachestore.ErrAuth + } + + if status >= 500 && status < 600 { + return cachestore.ErrTransient + } + } + + return err +} diff --git a/internal/orca/cachestore/s3/s3_test.go b/internal/orca/cachestore/s3/s3_test.go new file mode 100644 index 00000000..95466acf --- /dev/null +++ b/internal/orca/cachestore/s3/s3_test.go @@ -0,0 +1,235 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package s3 + +import ( + "bytes" + "context" + "errors" + "net/http" + "testing" + + awshttp "github.com/aws/aws-sdk-go-v2/aws/transport/http" + s3types "github.com/aws/aws-sdk-go-v2/service/s3/types" + smithy "github.com/aws/smithy-go" + smithyhttp "github.com/aws/smithy-go/transport/http" + + "github.com/Azure/unbounded/internal/orca/cachestore" + "github.com/Azure/unbounded/internal/orca/chunk" +) + +// makeResponseErr builds an *awshttp.ResponseError wrapping the +// given HTTP status code. Mirrors how the AWS SDK surfaces service +// errors to callers: an *awshttp.ResponseError nesting a +// *smithyhttp.ResponseError that carries the HTTP response. +func makeResponseErr(status int, inner error) *awshttp.ResponseError { + return &awshttp.ResponseError{ + ResponseError: &smithyhttp.ResponseError{ + Response: &smithyhttp.Response{ + Response: &http.Response{StatusCode: status}, + }, + Err: inner, + }, + } +} + +// TestIsPreconditionFailed_FromHTTPStatus verifies that 412 alone +// signals precondition failure; other statuses (and errors lacking +// HTTP-response context) do not. The original implementation matched +// service error codes by string ("PreconditionFailed", +// "InvalidArgument", "ConditionalRequestConflict") plus substring +// "412" - fragile across SDK versions and backend implementations. +func TestIsPreconditionFailed_FromHTTPStatus(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + err error + want bool + }{ + {"412 ResponseError -> true", makeResponseErr(412, errors.New("precondition")), true}, + {"500 ResponseError -> false", makeResponseErr(500, errors.New("ise")), false}, + {"404 ResponseError -> false", makeResponseErr(404, errors.New("not found")), false}, + {"plain error -> false", errors.New("StatusCode: 412 something"), false}, + {"nil -> false", nil, false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := isPreconditionFailed(tt.err); got != tt.want { + t.Errorf("isPreconditionFailed = %v, want %v", got, tt.want) + } + }) + } +} + +// TestIsNotFound covers the typed-error and HTTP-status branches. +func TestIsNotFound(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + err error + want bool + }{ + {"NoSuchKey typed", &s3types.NoSuchKey{}, true}, + {"NoSuchBucket typed", &s3types.NoSuchBucket{}, true}, + {"NotFound typed", &s3types.NotFound{}, true}, + {"404 ResponseError", makeResponseErr(404, errors.New("not found")), true}, + {"500 ResponseError", makeResponseErr(500, errors.New("ise")), false}, + {"plain error", errors.New("random"), false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := isNotFound(tt.err); got != tt.want { + t.Errorf("isNotFound = %v, want %v", got, tt.want) + } + }) + } +} + +// fakeAPIError implements smithy.APIError for testing the +// AccessDenied / Forbidden mapping path. +type fakeAPIError struct{ code string } + +func (e *fakeAPIError) Error() string { return e.code } +func (e *fakeAPIError) ErrorCode() string { return e.code } +func (e *fakeAPIError) ErrorMessage() string { return e.code } +func (e *fakeAPIError) ErrorFault() smithy.ErrorFault { return smithy.FaultUnknown } +func (e *fakeAPIError) HTTPStatusCode() int { return 0 } + +// TestMapErr covers the full mapping table: 404 / typed not-found +// -> ErrNotFound, AccessDenied APIError -> ErrAuth, 5xx -> +// ErrTransient, anything else passes through. +func TestMapErr(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + err error + want error + }{ + {"NoSuchKey -> ErrNotFound", &s3types.NoSuchKey{}, cachestore.ErrNotFound}, + {"404 ResponseError -> ErrNotFound", makeResponseErr(404, errors.New("nf")), cachestore.ErrNotFound}, + {"AccessDenied APIError -> ErrAuth", &fakeAPIError{code: "AccessDenied"}, cachestore.ErrAuth}, + {"InvalidAccessKeyId APIError -> ErrAuth", &fakeAPIError{code: "InvalidAccessKeyId"}, cachestore.ErrAuth}, + {"403 ResponseError -> ErrAuth", makeResponseErr(403, errors.New("denied")), cachestore.ErrAuth}, + {"401 ResponseError -> ErrAuth", makeResponseErr(401, errors.New("unauth")), cachestore.ErrAuth}, + {"500 ResponseError -> ErrTransient", makeResponseErr(500, errors.New("ise")), cachestore.ErrTransient}, + {"503 ResponseError -> ErrTransient", makeResponseErr(503, errors.New("unavail")), cachestore.ErrTransient}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := mapErr(tt.err) + if !errors.Is(got, tt.want) { + t.Errorf("mapErr = %v, want errors.Is(_, %v) true", got, tt.want) + } + }) + } +} + +// TestMapErr_PassthroughUnknown verifies that unrecognized errors +// pass through unchanged. +func TestMapErr_PassthroughUnknown(t *testing.T) { + t.Parallel() + + src := errors.New("unrecognized") + if got := mapErr(src); got != src { + t.Errorf("mapErr(unknown) = %v, want passthrough %v", got, src) + } +} + +// TestGetChunk_RejectsZeroN verifies that GetChunk refuses n <= 0. +// Forwarding such a request would produce a malformed S3 Range +// header (bytes=0--1) which the backend rejects with InvalidArgument. +// The wire-format boundary (cluster.DecodeChunkKey) already rejects +// object_size <= 0, so an in-process caller reaching this with n <= 0 +// is a logic bug we want surfaced as an explicit error. +// +// Regression for C-2. +func TestGetChunk_RejectsZeroN(t *testing.T) { + t.Parallel() + + d := &Driver{} + + tests := []struct { + name string + off int64 + n int64 + }{ + {"n zero", 0, 0}, + {"n negative", 0, -1}, + {"off negative", -1, 1024}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + _, err := d.GetChunk(context.Background(), chunkPathOnlyKey(), tt.off, tt.n) + if err == nil { + t.Errorf("GetChunk(off=%d, n=%d) returned nil; want error", tt.off, tt.n) + } + }) + } +} + +// TestPutChunk_RejectsZeroSize verifies that PutChunk refuses +// size <= 0. A zero-byte commit would poison the path with a +// 0-byte blob and subsequent GetChunk(n=expected) reads would +// either error or stream zero bytes. +// +// Regression for C-3. +func TestPutChunk_RejectsZeroSize(t *testing.T) { + t.Parallel() + + d := &Driver{} + + for _, size := range []int64{0, -1} { + if err := d.PutChunk(context.Background(), chunkPathOnlyKey(), size, nil); err == nil { + t.Errorf("PutChunk(size=%d) returned nil; want error", size) + } + } +} + +// chunkPathOnlyKey returns a minimal chunk.Key whose Path() can be +// computed; used by the GetChunk / PutChunk guard tests that error +// before any S3 round-trip. +func chunkPathOnlyKey() chunk.Key { + return chunk.Key{ + OriginID: "ox", + Bucket: "b", + ObjectKey: "o", + ETag: "e1", + ChunkSize: 1024, + Index: 0, + } +} + +// TestPutChunk_SeekableSizeMismatch verifies that PutChunk rejects +// a seekable reader whose actual length does not match the declared +// size. Without the seekable-path probe, a buggy caller passing a +// Reader of length M with size=N would either be rejected by S3 +// (ContentLength mismatch) or upload a wrong-sized blob. +// +// Regression for H-6. +func TestPutChunk_SeekableSizeMismatch(t *testing.T) { + t.Parallel() + + d := &Driver{} + + // Reader has 10 bytes, but caller claims 1024. PutChunk must + // fail at the seek-and-check probe before any RPC. + r := bytes.NewReader(make([]byte, 10)) + if err := d.PutChunk(context.Background(), chunkPathOnlyKey(), 1024, r); err == nil { + t.Errorf("PutChunk accepted seekable reader with size mismatch") + } + + // Reader has 100 bytes, caller claims 50: also a mismatch + // (caller would upload only 50, leaving 50 unread). + r = bytes.NewReader(make([]byte, 100)) + if err := d.PutChunk(context.Background(), chunkPathOnlyKey(), 50, r); err == nil { + t.Errorf("PutChunk accepted seekable reader longer than declared size") + } +} diff --git a/internal/orca/chunk/chunk.go b/internal/orca/chunk/chunk.go new file mode 100644 index 00000000..8a2eb3bd --- /dev/null +++ b/internal/orca/chunk/chunk.go @@ -0,0 +1,213 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package chunk implements the chunk model: ChunkKey, deterministic +// path encoding, and the range -> chunk-index iterator. +package chunk + +import ( + "crypto/sha256" + "encoding/binary" + "encoding/hex" + "fmt" + "hash" +) + +// Key is the immutable identifier for a chunk. +// +// Path encoding: +// +// LP(s) = LE64(uint64(len(s))) || s +// hashKey = sha256( +// LP(origin_id) || +// LP(bucket) || +// LP(key) || +// LP(etag) || +// LE64(chunk_size) +// ) +// path = "//" +type Key struct { + OriginID string + Bucket string + ObjectKey string + ETag string + ChunkSize int64 + Index int64 +} + +// Path returns the canonical on-store path for this ChunkKey. +func (k Key) Path() string { + h := sha256.New() + writeLP(h, k.OriginID) + writeLP(h, k.Bucket) + writeLP(h, k.ObjectKey) + writeLP(h, k.ETag) + + var sizeBuf [8]byte + binary.LittleEndian.PutUint64(sizeBuf[:], uint64(k.ChunkSize)) + h.Write(sizeBuf[:]) + sum := h.Sum(nil) + + return fmt.Sprintf("%s/%s/%d", k.OriginID, hex.EncodeToString(sum), k.Index) +} + +// Range returns the byte range [Off, Off+Len) within the origin +// object that this chunk corresponds to. +func (k Key) Range() (off, length int64) { + off = k.Index * k.ChunkSize + length = k.ChunkSize + + return off, length +} + +// ExpectedLen returns the authoritative number of bytes this chunk +// should contain given the object's total size. For non-tail chunks +// this is k.ChunkSize; for the tail chunk it is the remainder. If +// objectSize is zero or negative (unknown), returns k.ChunkSize. If +// the chunk is entirely past the end of the object, returns 0. +func (k Key) ExpectedLen(objectSize int64) int64 { + if objectSize <= 0 { + return k.ChunkSize + } + + off := k.Index * k.ChunkSize + if off >= objectSize { + return 0 + } + + remaining := objectSize - off + if remaining < k.ChunkSize { + return remaining + } + + return k.ChunkSize +} + +// String renders the key compactly for logging. +func (k Key) String() string { + if len(k.ETag) > 8 { + return fmt.Sprintf("ChunkKey{%s/%s/%s..@%d#%d}", + k.OriginID, k.Bucket, k.ObjectKey, k.Index, len(k.ETag)) + } + + return fmt.Sprintf("ChunkKey{%s/%s/%s@%d}", k.OriginID, k.Bucket, k.ObjectKey, k.Index) +} + +func writeLP(h hash.Hash, s string) { + var lenBuf [8]byte + binary.LittleEndian.PutUint64(lenBuf[:], uint64(len(s))) + h.Write(lenBuf[:]) + h.Write([]byte(s)) +} + +// IndexRange returns the inclusive [first, last] chunk indices that +// cover the byte range [start, end] of an object whose total size is +// objectSize. +// +// Inputs: +// - start, end: requested byte range (inclusive on both ends). +// Both must be >= 0 under normal use. +// - chunkSize: > 0; the configured chunk size. +// - objectSize: > 0 for any meaningful call. Empty-object callers +// should not invoke IndexRange; the server short-circuits to +// 200 + empty body upstream. +// +// Clamping behaviour: +// - end >= objectSize is clamped to objectSize - 1. +// - end < 0 is defensively clamped to 0 (returns first=0, last=0, +// meaning "chunk 0" - the caller must already have prevented +// reaching this branch in normal flow; the clamp is a guard +// against an arithmetic bug elsewhere, not a supported empty- +// range encoding). +// +// The function does not validate chunkSize > 0; a zero or negative +// chunkSize panics with a runtime division-by-zero. The config +// validation at startup (chunking.size minimum 1 MiB) guarantees +// this invariant in production. +func IndexRange(start, end, chunkSize, objectSize int64) (first, last int64) { + if end >= objectSize { + end = objectSize - 1 + } + + if end < 0 { + end = 0 + } + + first = start / chunkSize + last = end / chunkSize + + return first, last +} + +// Tier is one entry in the chunk-size policy: objects with size +// >= MinObjectSize use ChunkSize, unless a higher-threshold tier +// also matches (in which case the higher tier wins). +// +// Tiers form an ascending-threshold ladder that overrides a base +// chunk size for sufficiently large objects, letting operators +// trade per-chunk HTTP overhead against per-fill memory for big +// blobs without changing the storage layout. See SizeFor for the +// selection rule. +type Tier struct { + MinObjectSize int64 + ChunkSize int64 +} + +// SizeFor returns the chunk size to use for an object of objectSize +// bytes. tiers must be strictly ascending by MinObjectSize; callers +// are responsible for validating this at config load time. +// objectSize <= 0 (unknown) returns base unchanged so that callers +// without a HEAD-resolved size still get a valid chunk size. +// +// Selection rule: walk tiers in ascending threshold order and pick +// the last tier whose MinObjectSize <= objectSize. If no tier +// matches (objectSize is smaller than the smallest threshold, or +// tiers is empty), the base size is returned. Ties on a tier +// boundary are inclusive of the lower bound: an object of size +// exactly MinObjectSize uses that tier's ChunkSize. +func SizeFor(objectSize, base int64, tiers []Tier) int64 { + if objectSize <= 0 { + return base + } + + chosen := base + + for _, t := range tiers { + if t.MinObjectSize > objectSize { + // Tiers are sorted ascending; no later tier can match. + break + } + + chosen = t.ChunkSize + } + + return chosen +} + +// ChunkSlice returns the [off, len) within a single chunk that +// satisfies the original client byte range [start, end]. +// +// chunkIdx is the chunk index. chunkSize is the configured chunk size. +// objectSize is the total origin-object size (used to clamp the last +// chunk if it is partial). +func ChunkSlice(chunkIdx, chunkSize, start, end, objectSize int64) (off, length int64) { + chunkStart := chunkIdx * chunkSize + + chunkEnd := chunkStart + chunkSize - 1 + if chunkEnd >= objectSize { + chunkEnd = objectSize - 1 + } + + if start > chunkStart { + off = start - chunkStart + } + + sliceEnd := chunkEnd + if end < chunkEnd { + sliceEnd = end + } + + length = sliceEnd - chunkStart - off + 1 + + return off, length +} diff --git a/internal/orca/chunk/chunk_test.go b/internal/orca/chunk/chunk_test.go new file mode 100644 index 00000000..cfed7dcb --- /dev/null +++ b/internal/orca/chunk/chunk_test.go @@ -0,0 +1,379 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package chunk + +import ( + "strings" + "testing" +) + +// TestKey_ExpectedLen covers the per-chunk expected length given an +// object size: full chunks for non-tail, remainder for the tail, 0 for +// past-end, k.ChunkSize when objectSize is unknown (<= 0). +func TestKey_ExpectedLen(t *testing.T) { + t.Parallel() + + const cs = int64(1024) + + tests := []struct { + name string + k Key + objectSize int64 + want int64 + }{ + {"full chunk 0", Key{ChunkSize: cs, Index: 0}, 4096, cs}, + {"full chunk 2", Key{ChunkSize: cs, Index: 2}, 4096, cs}, + {"tail chunk partial", Key{ChunkSize: cs, Index: 3}, 3500, 3500 - 3072}, + {"chunk exactly fills object", Key{ChunkSize: cs, Index: 3}, 4096, cs}, + {"chunk past end returns 0", Key{ChunkSize: cs, Index: 5}, 3500, 0}, + {"objectSize 0 -> ChunkSize (unknown)", Key{ChunkSize: cs, Index: 0}, 0, cs}, + {"objectSize negative -> ChunkSize", Key{ChunkSize: cs, Index: 7}, -1, cs}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := tc.k.ExpectedLen(tc.objectSize) + if got != tc.want { + t.Errorf("ExpectedLen=%d want %d", got, tc.want) + } + }) + } +} + +// TestKey_Path_Deterministic verifies that the same inputs always +// produce the same path and that meaningful input differences +// (OriginID, Bucket, ObjectKey, ETag, ChunkSize, Index) produce +// distinct paths. The path encoding is part of orca's design +// contract: any change here invalidates previously cached chunks. +func TestKey_Path_Deterministic(t *testing.T) { + t.Parallel() + + base := Key{ + OriginID: "origin-a", + Bucket: "bucket", + ObjectKey: "key", + ETag: "etag1", + ChunkSize: 1024, + Index: 0, + } + // Same inputs -> same path. Compare two equally-constructed Keys + // (calling Path() on the same receiver tautologically passes). + dup := base + if base.Path() != dup.Path() { + t.Fatalf("Path() not deterministic for identical key") + } + + other := base + otherPath := other.Path() + + mutations := []struct { + name string + mut func(k *Key) + }{ + {"different origin", func(k *Key) { k.OriginID = "origin-b" }}, + {"different bucket", func(k *Key) { k.Bucket = "other-bucket" }}, + {"different key", func(k *Key) { k.ObjectKey = "other-key" }}, + {"different etag", func(k *Key) { k.ETag = "etag2" }}, + {"different chunk size", func(k *Key) { k.ChunkSize = 2048 }}, + {"different index", func(k *Key) { k.Index = 1 }}, + } + + for _, m := range mutations { + t.Run(m.name, func(t *testing.T) { + mutated := base + m.mut(&mutated) + + got := mutated.Path() + if got == otherPath { + t.Errorf("path collision after %s mutation: %q", m.name, got) + } + }) + } +} + +// TestKey_Path_Format asserts the documented path shape: +// "//". +func TestKey_Path_Format(t *testing.T) { + t.Parallel() + + k := Key{ + OriginID: "origin-a", + Bucket: "b", + ObjectKey: "k", + ETag: "e", + ChunkSize: 1024, + Index: 7, + } + + path := k.Path() + + parts := strings.Split(path, "/") + if len(parts) != 3 { + t.Fatalf("path %q has %d segments, want 3", path, len(parts)) + } + + if parts[0] != "origin-a" { + t.Errorf("origin segment=%q want %q", parts[0], "origin-a") + } + + if len(parts[1]) != 64 { + t.Errorf("hex segment len=%d want 64 (sha256)", len(parts[1])) + } + + for _, c := range parts[1] { + isDigit := c >= '0' && c <= '9' + isLowerHex := c >= 'a' && c <= 'f' + + if !isDigit && !isLowerHex { + t.Errorf("hex segment contains non-hex char %q", c) + break + } + } + + if parts[2] != "7" { + t.Errorf("index segment=%q want %q", parts[2], "7") + } +} + +// TestKey_Range verifies (off, length) = (Index*ChunkSize, ChunkSize). +func TestKey_Range(t *testing.T) { + t.Parallel() + + k := Key{ChunkSize: 1 << 20, Index: 3} + + off, length := k.Range() + if off != 3<<20 { + t.Errorf("off=%d want %d", off, 3<<20) + } + + if length != 1<<20 { + t.Errorf("length=%d want %d", length, 1<<20) + } +} + +// TestIndexRange covers the chunk-index span computed from a byte +// range plus the end clamping to objectSize. +func TestIndexRange(t *testing.T) { + t.Parallel() + + const chunkSize = int64(1024) + + tests := []struct { + name string + start, end int64 + objectSize int64 + wantFirst int64 + wantLast int64 + }{ + {"aligned full chunk", 0, 1023, 1024, 0, 0}, + {"aligned two chunks", 0, 2047, 4096, 0, 1}, + {"start mid-chunk, end mid-chunk same", 100, 500, 1024, 0, 0}, + {"start mid-chunk, end mid-next-chunk", 100, 1500, 4096, 0, 1}, + {"end clamped to objectSize", 0, 9999, 2048, 0, 1}, + {"single byte", 5, 5, 1024, 0, 0}, + {"last partial chunk", 1024, 1500, 1500, 1, 1}, + // Empty-object guard: end = -1 (objectSize == 0). Without + // the negative-end clamp Go's integer division floors to 0 + // but a subsequent negative-end could leak through other + // branches; defensive clamp here keeps last >= 0. + {"empty object end=-1 clamped to 0", 0, -1, 0, 0, 0}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + first, last := IndexRange(tt.start, tt.end, chunkSize, tt.objectSize) + if first != tt.wantFirst { + t.Errorf("first=%d want %d", first, tt.wantFirst) + } + + if last != tt.wantLast { + t.Errorf("last=%d want %d", last, tt.wantLast) + } + }) + } +} + +// TestChunkSlice covers the (off, length) within a single chunk that +// satisfies the original byte range. Critical for cross-chunk +// streamSlice copies. +func TestChunkSlice(t *testing.T) { + t.Parallel() + + const chunkSize = int64(1024) + + tests := []struct { + name string + chunkIdx int64 + start int64 + end int64 + objectSize int64 + wantOff int64 + wantLen int64 + }{ + {"entirely within chunk 0", 0, 100, 199, 4096, 100, 100}, + {"start at chunk 0 boundary", 0, 0, 99, 4096, 0, 100}, + {"end at chunk 0 boundary", 0, 0, 1023, 4096, 0, 1024}, + {"chunk 1, range covers full chunk", 1, 1024, 2047, 4096, 0, 1024}, + {"chunk spans range start", 1, 500, 1500, 4096, 0, 477}, // [1024..1500] + {"chunk spans range end", 1, 1500, 2500, 4096, 476, 548}, + {"last partial chunk", 3, 3000, 3500, 3500, 0, 428}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + off, length := ChunkSlice(tt.chunkIdx, chunkSize, tt.start, tt.end, tt.objectSize) + if off != tt.wantOff { + t.Errorf("off=%d want %d", off, tt.wantOff) + } + + if length != tt.wantLen { + t.Errorf("length=%d want %d", length, tt.wantLen) + } + }) + } +} + +// TestSizeFor covers the chunk-size tier ladder: base for objects +// below the first threshold (or unknown sizes), tier ChunkSize for +// objects at or above the corresponding MinObjectSize, and +// last-tier-wins resolution when multiple tiers match. +func TestSizeFor(t *testing.T) { + t.Parallel() + + const ( + base = int64(8 * 1024 * 1024) // 8 MiB + t1 = int64(64 * 1024 * 1024) // 64 MiB + t2 = int64(128 * 1024 * 1024) // 128 MiB + oneG = int64(1024 * 1024 * 1024) // 1 GiB + tenG = int64(10 * 1024 * 1024 * 1024) // 10 GiB + ) + + defaultTiers := []Tier{ + {MinObjectSize: oneG, ChunkSize: t1}, + {MinObjectSize: tenG, ChunkSize: t2}, + } + + tests := []struct { + name string + objectSize int64 + base int64 + tiers []Tier + want int64 + }{ + { + name: "empty tiers returns base", + objectSize: 100 << 20, + base: base, + tiers: nil, + want: base, + }, + { + name: "object below first threshold returns base", + objectSize: 512 << 20, + base: base, + tiers: defaultTiers, + want: base, + }, + { + name: "object exactly at first threshold uses first tier", + objectSize: oneG, + base: base, + tiers: defaultTiers, + want: t1, + }, + { + name: "object between tiers uses lower tier", + objectSize: oneG + (1 << 20), + base: base, + tiers: defaultTiers, + want: t1, + }, + { + name: "object exactly at second threshold uses second tier", + objectSize: tenG, + base: base, + tiers: defaultTiers, + want: t2, + }, + { + name: "huge object uses highest tier", + objectSize: 700 * 1024 * 1024 * 1024, + base: base, + tiers: defaultTiers, + want: t2, + }, + { + name: "zero objectSize (unknown) returns base", + objectSize: 0, + base: base, + tiers: defaultTiers, + want: base, + }, + { + name: "negative objectSize returns base", + objectSize: -1, + base: base, + tiers: defaultTiers, + want: base, + }, + { + name: "single tier above object", + objectSize: 500 << 20, + base: base, + tiers: []Tier{{MinObjectSize: oneG, ChunkSize: t1}}, + want: base, + }, + { + name: "single tier at object", + objectSize: oneG, + base: base, + tiers: []Tier{{MinObjectSize: oneG, ChunkSize: t1}}, + want: t1, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := SizeFor(tt.objectSize, tt.base, tt.tiers) + if got != tt.want { + t.Errorf("SizeFor(%d, %d, %v)=%d want %d", + tt.objectSize, tt.base, tt.tiers, got, tt.want) + } + }) + } +} + +// TestKey_String covers both formatting branches (short ETag + long +// ETag). +func TestKey_String(t *testing.T) { + t.Parallel() + + short := Key{ + OriginID: "o", + Bucket: "b", + ObjectKey: "k", + ETag: "abc", + Index: 5, + } + if s := short.String(); !strings.Contains(s, "@5") { + t.Errorf("short ETag string=%q does not contain @5", s) + } + + long := Key{ + OriginID: "o", + Bucket: "b", + ObjectKey: "k", + ETag: "abcdefghi", // 9 chars > 8 + Index: 5, + } + + s := long.String() + if !strings.Contains(s, "..@") { + t.Errorf("long ETag string=%q does not contain truncation marker '..@'", s) + } + + if !strings.Contains(s, "#9") { + t.Errorf("long ETag string=%q does not contain length suffix '#9'", s) + } +} diff --git a/internal/orca/chunkcatalog/chunkcatalog.go b/internal/orca/chunkcatalog/chunkcatalog.go new file mode 100644 index 00000000..8b80c0bd --- /dev/null +++ b/internal/orca/chunkcatalog/chunkcatalog.go @@ -0,0 +1,168 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package chunkcatalog implements a bounded LRU recording chunks known +// to be present in the CacheStore. Pure hot-path optimization; +// CacheStore is the source of truth. +// +// The catalog is presence-only: it tracks whether a chunk's path is +// known to exist in the cachestore. No size or metadata is stored. +// chunk.Path encodes (origin_id, bucket, key, etag, chunk_size), so +// a path hit means the cachestore contains bytes for this exact +// version of this chunk - the path encoding IS the integrity +// statement, and a stale entry whose backing bytes have been deleted +// is self-healing (cachestore.GetChunk returns ErrNotFound, caller +// Forget()s the entry and falls through to the stat path). +package chunkcatalog + +import ( + "container/list" + "context" + "log/slog" + "sync" + + "github.com/Azure/unbounded/internal/orca/chunk" +) + +// Catalog is a bounded LRU keyed on chunk.Key.Path(). +type Catalog struct { + mu sync.Mutex + maxEntries int + ll *list.List + idx map[string]*list.Element + log *slog.Logger +} + +type entry struct { + path string +} + +// New constructs a Catalog. The log is used at debug level for +// per-call hit / miss / record / forget / evict trace lines via +// slog.LogAttrs so the cost when filtered out (operator runs at +// info or higher) is just the handler's level check. Passing nil +// falls back to slog.Default(). +func New(maxEntries int, log *slog.Logger) *Catalog { + if maxEntries <= 0 { + maxEntries = 100_000 + } + + if log == nil { + log = slog.Default() + } + + return &Catalog{ + maxEntries: maxEntries, + ll: list.New(), + idx: make(map[string]*list.Element, maxEntries), + log: log, + } +} + +// Lookup reports whether the chunk is known to be present in the +// cachestore. Bumps the LRU position on hit. +// +// This is the hottest log site in orca: it fires on every chunk read +// attempt. The LogAttrs path ensures attribute-evaluation cost is +// zero when the configured level is above Debug. +func (c *Catalog) Lookup(k chunk.Key) bool { + path := k.Path() + + c.mu.Lock() + defer c.mu.Unlock() + + el, ok := c.idx[path] + if !ok { + c.log.LogAttrs(context.Background(), slog.LevelDebug, "chunkcatalog_lookup_miss", + catalogAttrs(k), + ) + + return false + } + + c.ll.MoveToFront(el) + + c.log.LogAttrs(context.Background(), slog.LevelDebug, "chunkcatalog_lookup_hit", + catalogAttrs(k), + ) + + return true +} + +// Record marks the chunk as present. +// +// The catalog is presence-only: callers do not pass (and the catalog +// does not store) any size or freshness metadata. chunk.Path encodes +// (origin_id, bucket, key, etag, chunk_size), so a Recorded key is +// sufficient to know which exact version is in the cachestore. See +// the package docstring for the rationale. +func (c *Catalog) Record(k chunk.Key) { + path := k.Path() + + c.mu.Lock() + defer c.mu.Unlock() + + if el, ok := c.idx[path]; ok { + c.ll.MoveToFront(el) + + c.log.LogAttrs(context.Background(), slog.LevelDebug, "chunkcatalog_record_update", + catalogAttrs(k), + ) + + return + } + + el := c.ll.PushFront(&entry{path: path}) + + c.idx[path] = el + + c.log.LogAttrs(context.Background(), slog.LevelDebug, "chunkcatalog_record_insert", + catalogAttrs(k), + ) + + for c.ll.Len() > c.maxEntries { + oldest := c.ll.Back() + if oldest == nil { + break + } + + c.ll.Remove(oldest) + + oldEntry := oldest.Value.(*entry) //nolint:errcheck // type invariant: list elements are *entry + delete(c.idx, oldEntry.path) + + c.log.LogAttrs(context.Background(), slog.LevelDebug, "chunkcatalog_evict", + slog.String("evicted_path", oldEntry.path), + slog.Int("lru_len", c.ll.Len()), + ) + } +} + +// Forget removes the entry if present. +func (c *Catalog) Forget(k chunk.Key) { + path := k.Path() + + c.mu.Lock() + defer c.mu.Unlock() + + if el, ok := c.idx[path]; ok { + c.ll.Remove(el) + delete(c.idx, path) + c.log.LogAttrs(context.Background(), slog.LevelDebug, "chunkcatalog_forget", + catalogAttrs(k), + ) + } +} + +// catalogAttrs renders the chunk's identifying tuple as a slog +// group attribute, matching the 'chunk' taxonomy used by +// fetch.Coordinator emissions so operator queries can grep on a +// single consistent attribute path across packages. +func catalogAttrs(k chunk.Key) slog.Attr { + return slog.Group("chunk", + slog.String("origin_id", k.OriginID), + slog.String("bucket", k.Bucket), + slog.String("key", k.ObjectKey), + slog.Int64("index", k.Index), + ) +} diff --git a/internal/orca/chunkcatalog/chunkcatalog_test.go b/internal/orca/chunkcatalog/chunkcatalog_test.go new file mode 100644 index 00000000..ea66893b --- /dev/null +++ b/internal/orca/chunkcatalog/chunkcatalog_test.go @@ -0,0 +1,141 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package chunkcatalog + +import ( + "bytes" + "io" + "log/slog" + "strings" + "testing" + + "github.com/Azure/unbounded/internal/orca/chunk" +) + +// TestNew_UsesInjectedLogger locks the contract that the catalog +// stores the caller's logger rather than slog.Default. +func TestNew_UsesInjectedLogger(t *testing.T) { + t.Parallel() + + injected := slog.New(slog.NewTextHandler(io.Discard, nil)) + c := New(16, injected) + + if c.log != injected { + t.Errorf("Catalog.log not the injected logger") + } +} + +// TestNew_NilLoggerFallsBackToDefault verifies the nil-logger +// fallback so misconfigured callers do not panic on the first +// trace emission. +func TestNew_NilLoggerFallsBackToDefault(t *testing.T) { + t.Parallel() + + c := New(16, nil) + if c.log == nil { + t.Errorf("nil logger should have fallen back to slog.Default()") + } +} + +// TestRecord_Lookup_Forget exercises the basic LRU operations +// against the presence-only API. +func TestRecord_Lookup_Forget(t *testing.T) { + t.Parallel() + + c := New(16, nil) + + k := chunk.Key{OriginID: "o", Bucket: "b", ObjectKey: "key", ChunkSize: 1024} + if c.Lookup(k) { + t.Fatalf("lookup before record returned hit") + } + + c.Record(k) + + if !c.Lookup(k) { + t.Errorf("lookup after record returned miss") + } + + c.Forget(k) + + if c.Lookup(k) { + t.Errorf("lookup after forget returned hit") + } +} + +// TestDebugEmissions verifies the catalog emits the standardized +// 'chunk' attribute group at debug level on the four operation +// classes (lookup hit, lookup miss, record insert, forget) and that +// the messages route through the injected logger. +func TestDebugEmissions(t *testing.T) { + t.Parallel() + + var buf bytes.Buffer + + log := slog.New(slog.NewTextHandler(&buf, &slog.HandlerOptions{Level: slog.LevelDebug})) + c := New(16, log) + + k := chunk.Key{OriginID: "ox", Bucket: "bkt", ObjectKey: "obj", ChunkSize: 1024, Index: 4} + + c.Lookup(k) // miss + c.Record(k) + c.Lookup(k) // hit + c.Forget(k) + + out := buf.String() + for _, want := range []string{ + "chunkcatalog_lookup_miss", + "chunkcatalog_record_insert", + "chunkcatalog_lookup_hit", + "chunkcatalog_forget", + "chunk.index=4", + "chunk.key=obj", + } { + if !strings.Contains(out, want) { + t.Errorf("expected %q in debug output; got %q", want, out) + } + } +} + +// TestDebugFilteredAtInfo verifies the catalog emits nothing when +// the handler is configured above Debug, so the hot-path overhead +// at production levels is just the handler's level check. +func TestDebugFilteredAtInfo(t *testing.T) { + t.Parallel() + + var buf bytes.Buffer + + log := slog.New(slog.NewTextHandler(&buf, &slog.HandlerOptions{Level: slog.LevelInfo})) + c := New(16, log) + + k := chunk.Key{OriginID: "ox", Bucket: "b", ObjectKey: "o", ChunkSize: 1024} + c.Record(k) + c.Lookup(k) + c.Forget(k) + + if buf.Len() != 0 { + t.Errorf("debug emission leaked through Info-level handler: %q", buf.String()) + } +} + +// TestEvictEmitsAttr ensures the LRU-eviction debug emission fires +// when capacity is exceeded. Capacity 1 plus two distinct inserts +// forces an eviction observable via the evicted_path attribute. +func TestEvictEmitsAttr(t *testing.T) { + t.Parallel() + + var buf bytes.Buffer + + log := slog.New(slog.NewTextHandler(&buf, &slog.HandlerOptions{Level: slog.LevelDebug})) + c := New(1, log) + + k1 := chunk.Key{OriginID: "o", Bucket: "b", ObjectKey: "a", ChunkSize: 1024} + k2 := chunk.Key{OriginID: "o", Bucket: "b", ObjectKey: "b", ChunkSize: 1024} + + c.Record(k1) + c.Record(k2) + + if !strings.Contains(buf.String(), "chunkcatalog_evict") { + t.Errorf("evict emission missing from output: %q", buf.String()) + } +} diff --git a/internal/orca/cluster/cluster.go b/internal/orca/cluster/cluster.go new file mode 100644 index 00000000..a4f240c0 --- /dev/null +++ b/internal/orca/cluster/cluster.go @@ -0,0 +1,718 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package cluster handles peer discovery and rendezvous-hash +// coordinator selection. +// +// Peer discovery: the headless Kubernetes Service backing the Orca +// Deployment publishes Pod IPs in its A-record. We poll DNS at +// cluster.membership_refresh interval (default 5s) and snapshot the +// peer set. +// +// Coordinator selection: rendezvous hashing on (peer_ip, ChunkKey) +// picks one coordinator per chunk across the cluster. +// +// Internal RPC: each replica runs an HTTP/2 client to dial peers' +// internal listeners (mTLS in production, plain in dev). The +// listener side is in the server/internal handler. +// +// # Test seams +// +// Production constructs a DNS-backed PeerSource implicitly from +// cfg.Cluster.Service + net.DefaultResolver. Tests substitute the +// entire mechanism with WithPeerSource (typically a mutable +// StaticPeerSource per replica). +package cluster + +import ( + "context" + "crypto/sha256" + "encoding/binary" + "errors" + "fmt" + "io" + "log/slog" + "net" + "net/http" + "net/url" + "strconv" + "sync/atomic" + "time" + + "github.com/Azure/unbounded/internal/orca/chunk" + "github.com/Azure/unbounded/internal/orca/config" +) + +// Peer represents one replica in the current peer-set snapshot. +// +// In production every Peer has Port == 0 because pod IPs are +// addressed on the same internal-listener port across the +// Deployment. Integration tests with multiple replicas sharing +// 127.0.0.1 set Port to the per-replica OS-assigned port; in that +// mode FillFromPeer dials peer.IP:peer.Port instead of falling back +// to cfg.Cluster.InternalListen's port. +type Peer struct { + IP string + Port int // 0 = use cfg.Cluster.InternalListen's port (production) + Self bool // true when this Peer entry represents the local replica +} + +// Cluster manages peer discovery, rendezvous hashing, and the +// internal-RPC client. +type Cluster struct { + cfg config.Cluster + log *slog.Logger + + peers atomic.Pointer[[]Peer] + + httpClient *http.Client + source PeerSource + + // consecutiveRefreshErrors counts adjacent failed refresh attempts. + // Reset on any successful refresh. When the count exceeds + // maxStalePeerRefreshes the retained-previous fallback gives up + // and reverts to a self-only peer set. + consecutiveRefreshErrors atomic.Int64 + + cancelFn context.CancelFunc + done chan struct{} +} + +// maxStalePeerRefreshes is the number of consecutive refresh failures +// after which Cluster.refresh stops retaining the previous peer-set +// snapshot and falls back to [Self]. Bounds how long we route to +// dead peers if peer discovery is permanently broken. +const maxStalePeerRefreshes = 5 + +// resolver looks up the host names that back the headless Service. +// Production uses net.DefaultResolver. The interface is +// package-internal: production code does not customize it, and the +// DNS-backed peer source is the only implementation. +type resolver interface { + LookupHost(ctx context.Context, host string) ([]string, error) +} + +// PeerSource produces the current peer-set snapshot. The DNS-backed +// implementation queries the headless Service's A-record. Tests +// substitute a StaticPeerSource that returns a mutable list of peers +// with explicit Port values (so multiple replicas can share an IP). +// +// Each returned Peer.Self must be authoritatively set by the source +// (the source knows the calling replica's identity at construction +// time, so it is the only place that can stamp Self correctly when +// peers share an IP). +type PeerSource interface { + Peers(ctx context.Context) ([]Peer, error) +} + +// Option configures a Cluster at construction time. +type Option func(*Cluster) + +// WithPeerSource replaces the entire peer-discovery mechanism. This +// is the primary test seam; production code constructs the default +// DNS-backed source implicitly from cfg.Cluster.Service. +func WithPeerSource(s PeerSource) Option { + return func(c *Cluster) { c.source = s } +} + +// WithHTTPClient overrides the internal-RPC HTTP client. TEST-ONLY: +// production constructs the default client from cfg via newHTTPClient. +// Used by unit tests that need to inject a client with custom timeouts +// or transport behaviour for deterministic deadline coverage. +func WithHTTPClient(c *http.Client) Option { + return func(cl *Cluster) { cl.httpClient = c } +} + +// WithLogger overrides the cluster's structured logger. The default +// is slog.Default(). The logger receives debug-level emissions for +// every refresh cycle, coordinator selection, and FillFromPeer call, +// plus warn-level emissions for retained-previous-snapshot fallback. +func WithLogger(log *slog.Logger) Option { + return func(cl *Cluster) { cl.log = log } +} + +func newDNSPeerSource(service, selfIP string, r resolver) PeerSource { + if r == nil { + r = net.DefaultResolver + } + + return &dnsPeerSource{ + service: service, + selfIP: selfIP, + resolver: r, + } +} + +type dnsPeerSource struct { + service string + selfIP string + resolver resolver +} + +func (s *dnsPeerSource) Peers(ctx context.Context) ([]Peer, error) { + rctx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + + ips, err := s.resolver.LookupHost(rctx, s.service) + if err != nil { + return nil, err + } + + peers := make([]Peer, 0, len(ips)) + for _, ip := range ips { + peers = append(peers, Peer{IP: ip, Self: ip == s.selfIP}) + } + + return peers, nil +} + +// New returns a Cluster and starts the membership-refresh goroutine. +func New(parent context.Context, cfg config.Cluster, opts ...Option) (*Cluster, error) { + if cfg.Service == "" { + return nil, fmt.Errorf("cluster: service required (headless Service FQDN)") + } + + if cfg.SelfPodIP == "" { + return nil, fmt.Errorf("cluster: self_pod_ip required (set POD_IP env)") + } + + ctx, cancel := context.WithCancel(parent) + + httpClient, err := newHTTPClient(cfg) + if err != nil { + cancel() + return nil, err + } + + c := &Cluster{ + cfg: cfg, + log: slog.Default(), + httpClient: httpClient, + source: newDNSPeerSource(cfg.Service, cfg.SelfPodIP, nil), + cancelFn: cancel, + done: make(chan struct{}), + } + + for _, opt := range opts { + opt(c) + } + + if c.log == nil { + c.log = slog.Default() + } + // Initial refresh; failure is non-fatal (empty peer-set fallback). + c.refresh(ctx) + + go c.refreshLoop(ctx) + + return c, nil +} + +// Close stops the refresh goroutine and waits for it to exit. If ctx +// is canceled before the goroutine exits (e.g. an in-flight DNS +// lookup is taking longer than the caller can tolerate) Close returns +// the context error. The underlying cancellation is always signalled, +// so the goroutine will exit eventually even if the caller stops +// waiting. +func (c *Cluster) Close(ctx context.Context) error { + c.cancelFn() + + select { + case <-c.done: + return nil + case <-ctx.Done(): + return ctx.Err() + } +} + +// Peers returns the current peer-set snapshot. +func (c *Cluster) Peers() []Peer { + p := c.peers.Load() + if p == nil { + return []Peer{{IP: c.cfg.SelfPodIP, Self: true}} + } + + return *p +} + +// HasInitialSnapshot reports whether the cluster has loaded at least +// one peer-set snapshot (success or failure path - any value stored +// by refresh counts). Used by the app's /readyz endpoint to gate +// readiness on cluster discovery having completed its initial pass. +// Returns false only during the bootstrap window before refresh +// runs even once. +func (c *Cluster) HasInitialSnapshot() bool { + return c.peers.Load() != nil +} + +// Coordinator selects the rendezvous-hashed coordinator for a chunk. +// +// Returns the Peer with the highest hash(peer || chunk_path) score. +// Peers() always returns at least one entry (self, via the bootstrap +// fallback in Peers and the never-empty post-condition of every +// branch in refresh), so this function does not need to handle an +// empty input. +func (c *Cluster) Coordinator(k chunk.Key) Peer { + peers := c.Peers() + + path := []byte(k.Path()) + + var ( + best Peer + bestScore uint64 + ) + + for i, p := range peers { + score := rendezvousScore(p, path) + if i == 0 || score > bestScore { + bestScore = score + best = p + } + } + + c.log.LogAttrs(context.Background(), slog.LevelDebug, "coordinator_selected", + slog.String("origin_id", k.OriginID), + slog.String("bucket", k.Bucket), + slog.String("key", k.ObjectKey), + slog.Int64("index", k.Index), + slog.String("chosen_ip", best.IP), + slog.Bool("is_self", best.Self), + slog.Uint64("score", bestScore), + ) + + return best +} + +// IsCoordinator reports whether this replica is the coordinator for k. +// Every code path producing a coord value stamps the Self flag +// authoritatively (dnsPeerSource matches by selfIP; StaticPeerSource +// by (selfIP, selfPort); the empty-peer-set fallback constructs +// c.self()), so checking Self is the single source of truth. +func (c *Cluster) IsCoordinator(k chunk.Key) bool { + return c.Coordinator(k).Self +} + +// FillFromPeer issues GET /internal/fill against the named peer and +// returns the streaming chunk body. Caller closes the returned +// reader. objectSize is the authoritative size of the object the +// chunk belongs to; it is forwarded to the peer so the leader can +// compute the correct per-chunk length (especially for the tail +// chunk) and set Content-Length on its response. +func (c *Cluster) FillFromPeer(ctx context.Context, p Peer, k chunk.Key, objectSize int64) (io.ReadCloser, error) { + if p.Self { + return nil, fmt.Errorf("cluster: refusing to FillFromPeer for self") + } + + scheme := "http" + if c.cfg.InternalTLS.Enabled { + scheme = "https" + } + + port := strconv.Itoa(p.Port) + if p.Port == 0 { + _, defaultPort, err := net.SplitHostPort(c.cfg.InternalListen) + if err != nil { + defaultPort = "8444" + } + + port = defaultPort + } + + target := url.URL{ + Scheme: scheme, + Host: net.JoinHostPort(p.IP, port), + Path: "/internal/fill", + RawQuery: encodeChunkKey(k, objectSize), + } + + c.log.LogAttrs(ctx, slog.LevelDebug, "fill_from_peer_request", + slog.String("peer_ip", p.IP), + slog.String("peer_port", port), + slog.String("origin_id", k.OriginID), + slog.String("bucket", k.Bucket), + slog.String("key", k.ObjectKey), + slog.Int64("index", k.Index), + slog.Int64("object_size", objectSize), + ) + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, target.String(), nil) + if err != nil { + return nil, fmt.Errorf("cluster: build internal-fill request: %w", err) + } + + req.Header.Set("X-Orca-Internal", "1") + + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("cluster: internal-fill RPC: %w", err) + } + + if resp.StatusCode == http.StatusConflict { + _ = resp.Body.Close() //nolint:errcheck // best-effort close on error path + + c.log.LogAttrs(ctx, slog.LevelDebug, "fill_from_peer_not_coordinator", + slog.String("peer_ip", p.IP), + slog.String("origin_id", k.OriginID), + slog.Int64("index", k.Index), + ) + + return nil, ErrPeerNotCoordinator + } + + if resp.StatusCode/100 != 2 { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024)) //nolint:errcheck // best-effort error body read + _ = resp.Body.Close() //nolint:errcheck // best-effort close on error path + + return nil, fmt.Errorf("cluster: internal-fill RPC returned %d: %s", + resp.StatusCode, string(body)) + } + + c.log.LogAttrs(ctx, slog.LevelDebug, "fill_from_peer_response", + slog.String("peer_ip", p.IP), + slog.Int("status", resp.StatusCode), + slog.Int64("content_length", resp.ContentLength), + ) + + // Wrap the response body in a defense-in-depth validator that + // ensures the peer delivered exactly Content-Length bytes. + // net/http already raises io.ErrUnexpectedEOF when the body + // closes short of an explicit Content-Length, but the wrapper + // makes that contract explicit at the call site (so readers of + // FillFromPeer do not need to reason about transport internals) + // and guards against future changes to net/http's behavior. + if resp.ContentLength > 0 { + return &validatingReader{ + rc: resp.Body, + expected: resp.ContentLength, + }, nil + } + + return resp.Body, nil +} + +// validatingReader wraps an io.ReadCloser and returns +// io.ErrUnexpectedEOF if the underlying stream closes after fewer +// than expected bytes. Used by FillFromPeer to detect truncated +// cross-replica internal-fill responses. +type validatingReader struct { + rc io.ReadCloser + expected int64 + got int64 +} + +func (r *validatingReader) Read(p []byte) (int, error) { + n, err := r.rc.Read(p) + r.got += int64(n) + + if errors.Is(err, io.EOF) && r.got != r.expected { + return n, fmt.Errorf("cluster: internal-fill truncated: got %d bytes, expected %d: %w", + r.got, r.expected, io.ErrUnexpectedEOF) + } + + return n, err +} + +func (r *validatingReader) Close() error { return r.rc.Close() } + +// ErrPeerNotCoordinator is returned by FillFromPeer when the peer +// reports it is not the coordinator (membership disagreement). +var ErrPeerNotCoordinator = fmt.Errorf("cluster: peer is not the coordinator (409 Conflict)") + +func (c *Cluster) refreshLoop(ctx context.Context) { + defer close(c.done) + + t := time.NewTicker(c.cfg.MembershipRefresh) + defer t.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-t.C: + c.refresh(ctx) + } + } +} + +func (c *Cluster) refresh(ctx context.Context) { + peers, err := c.source.Peers(ctx) + if err != nil { + // A cancelled parent ctx (process shutdown) is not a + // discovery failure: it means the refresh loop is exiting. + // Bumping the streak counter on the way out would push the + // final snapshot into the self-only fallback path and emit + // a noisy 'discovery failed' warning during normal + // shutdown. + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return + } + // Discovery failed. Retain the previous snapshot if we have + // one and we have not exceeded the staleness ceiling; the + // internal-fill RPC fallback (cluster.ErrPeerNotCoordinator + // -> local fill in fetch.Coordinator.GetChunk) absorbs + // pointing at briefly-stale peers. On bootstrap (no previous + // snapshot) or after too many consecutive errors, fall back + // to a self-only peer set so we keep making forward progress. + streak := c.consecutiveRefreshErrors.Add(1) + + if c.peers.Load() != nil && streak <= maxStalePeerRefreshes { + c.log.LogAttrs(ctx, slog.LevelWarn, "cluster: peer discovery failed; retaining previous snapshot", + slog.Any("err", err), + slog.Int64("consecutive_errors", streak), + ) + + return + } + + self := []Peer{{IP: c.cfg.SelfPodIP, Self: true}} + c.storePeerSet(ctx, self, "self_only_fallback") + + return + } + + c.consecutiveRefreshErrors.Store(0) + + if len(peers) == 0 { + // DNS legitimately reports no peers (e.g. headless Service + // has no Ready pods other than maybe self). Apply self-only + // fallback. + self := []Peer{{IP: c.cfg.SelfPodIP, Self: true}} + c.storePeerSet(ctx, self, "empty_discovery_self_only") + + return + } + // Ensure self is always in the set even if discovery hasn't + // caught up yet. + hasSelf := false + + for _, p := range peers { + if p.Self { + hasSelf = true + break + } + } + + if !hasSelf { + peers = append(peers, Peer{IP: c.cfg.SelfPodIP, Self: true}) + } + + c.storePeerSet(ctx, peers, "discovery_ok") +} + +// storePeerSet atomically swaps in a fresh peer-set snapshot and +// emits trace lines describing the transition. A per-cycle debug +// emission fires unconditionally; an info-level 'peer_set_changed' +// emission fires only when the rendered set differs from the +// previously stored snapshot. The reason argument tags the source +// of the new snapshot for diagnostic clarity. +func (c *Cluster) storePeerSet(ctx context.Context, peers []Peer, reason string) { + prev := c.peers.Load() + c.peers.Store(&peers) + + c.log.LogAttrs(ctx, slog.LevelDebug, "peer_set_refreshed", + slog.String("reason", reason), + slog.Int("count", len(peers)), + ) + + if prev == nil { + // First snapshot: log it at info so operators see the + // bootstrap transition. + c.log.LogAttrs(ctx, slog.LevelInfo, "peer_set_initial", + slog.String("reason", reason), + slog.Int("count", len(peers)), + ) + + return + } + + added, removed := diffPeers(*prev, peers) + if len(added) == 0 && len(removed) == 0 { + return + } + + c.log.LogAttrs(ctx, slog.LevelInfo, "peer_set_changed", + slog.String("reason", reason), + slog.Int("count", len(peers)), + slog.Any("added", added), + slog.Any("removed", removed), + ) +} + +// diffPeers returns the IP+Port lists added and removed between the +// previous and next snapshots. Self flag is ignored for diff purposes +// because membership identity is the (ip, port) tuple; the same peer +// flipping Self is a no-op for membership transitions. +func diffPeers(prev, next []Peer) (added, removed []string) { + seen := make(map[string]bool, len(prev)) + for _, p := range prev { + seen[peerKey(p)] = true + } + + nextSet := make(map[string]bool, len(next)) + for _, p := range next { + nextSet[peerKey(p)] = true + + if !seen[peerKey(p)] { + added = append(added, peerKey(p)) + } + } + + for _, p := range prev { + if !nextSet[peerKey(p)] { + removed = append(removed, peerKey(p)) + } + } + + return added, removed +} + +func peerKey(p Peer) string { + if p.Port == 0 { + return p.IP + } + + return fmt.Sprintf("%s:%d", p.IP, p.Port) +} + +func newHTTPClient(cfg config.Cluster) (*http.Client, error) { + // Guard: internal TLS configuration is not yet wired through to + // the transport. Refusing to start when cfg.InternalTLS.Enabled + // is true prevents a silent security downgrade in which the + // client would dial https:// against the system trust store + // instead of the configured CA / client cert. The production + // path (load CAFile + optional client cert/key into + // tr.TLSClientConfig) is not implemented; this guard must be + // removed in tandem with that work. + if cfg.InternalTLS.Enabled { + return nil, fmt.Errorf("cluster: internal TLS requested (cluster.internal_tls.enabled=true) but not yet implemented; refusing to start") + } + + // DialContext bounds connect-level latency independently of the + // caller's ctx. Without this, a stuck TCP SYN against a half- + // failed peer would hang until the caller's deadline (which can + // be the full 5-minute fill ctx for leader-side fills). 10s is + // generous for in-DC latency and short enough that a failed-fast + // peer fallback is visible. + dialer := &net.Dialer{ + Timeout: 10 * time.Second, + KeepAlive: 30 * time.Second, + } + + tr := &http.Transport{ + DialContext: dialer.DialContext, + MaxIdleConns: 16, + MaxIdleConnsPerHost: 4, + IdleConnTimeout: 30 * time.Second, + // TLSHandshakeTimeout bounds the handshake separately from + // the request ctx so a malicious / misconfigured peer cannot + // hold a half-open TLS connection past the dial timeout. + TLSHandshakeTimeout: 10 * time.Second, + ExpectContinueTimeout: 1 * time.Second, + ForceAttemptHTTP2: true, + } + + // No http.Client.Timeout: it is the request-total wall clock and + // would clamp long-running internal-fill body streams (an 8 MiB + // chunk on a degraded inter-pod link can exceed 60s). The caller's + // ctx (an edge request ctx for client-driven fills, the 5-minute + // detached fill ctx in fetch.runFill for leader-side ones) is the + // body-read deadline; the Transport-level Dial / TLS handshake + // timeouts above bound the connection-establishment surface + // independently. + return &http.Client{ + Transport: tr, + }, nil +} + +// Score returns the rendezvous-hash score for (peer, key). Exposed so +// integration tests can craft phantom peers that deterministically +// win or lose against a real peer for a given key (used to induce +// membership disagreement scenarios). +func Score(p Peer, key []byte) uint64 { + return rendezvousScore(p, key) +} + +func rendezvousScore(p Peer, key []byte) uint64 { + h := sha256.New() + h.Write([]byte(p.IP)) + h.Write([]byte{0}) + + if p.Port != 0 { + // In production every peer has Port=0 so this branch never + // fires and the score is identical to historical behavior + // (sha256(ip || 0 || key)). Tests with multiple peers sharing + // 127.0.0.1 set distinct Ports so the score differentiates + // replicas. + var pb [4]byte + binary.BigEndian.PutUint32(pb[:], uint32(p.Port)) + h.Write(pb[:]) + h.Write([]byte{0}) + } + + h.Write(key) + sum := h.Sum(nil) + + return binary.BigEndian.Uint64(sum[:8]) +} + +func encodeChunkKey(k chunk.Key, objectSize int64) string { + v := url.Values{} + v.Set("origin_id", k.OriginID) + v.Set("bucket", k.Bucket) + v.Set("key", k.ObjectKey) + v.Set("etag", k.ETag) + v.Set("chunk_size", strconv.FormatInt(k.ChunkSize, 10)) + v.Set("index", strconv.FormatInt(k.Index, 10)) + v.Set("object_size", strconv.FormatInt(objectSize, 10)) + + return v.Encode() +} + +// DecodeChunkKey parses query params into a Key plus the authoritative +// object size. Used by the internal listener (server/internal/fill). +func DecodeChunkKey(values url.Values) (chunk.Key, int64, error) { + chunkSize, err := strconv.ParseInt(values.Get("chunk_size"), 10, 64) + if err != nil { + return chunk.Key{}, 0, fmt.Errorf("invalid chunk_size: %w", err) + } + + if chunkSize <= 0 { + return chunk.Key{}, 0, fmt.Errorf("invalid chunk_size: must be > 0, got %d", chunkSize) + } + + idx, err := strconv.ParseInt(values.Get("index"), 10, 64) + if err != nil { + return chunk.Key{}, 0, fmt.Errorf("invalid index: %w", err) + } + + if idx < 0 { + return chunk.Key{}, 0, fmt.Errorf("invalid index: must be >= 0, got %d", idx) + } + + objectSize, err := strconv.ParseInt(values.Get("object_size"), 10, 64) + if err != nil { + return chunk.Key{}, 0, fmt.Errorf("invalid object_size: %w", err) + } + + if objectSize <= 0 { + return chunk.Key{}, 0, fmt.Errorf("invalid object_size: must be > 0, got %d", objectSize) + } + + originID := values.Get("origin_id") + bucket := values.Get("bucket") + key := values.Get("key") + etag := values.Get("etag") + + if originID == "" || key == "" { + return chunk.Key{}, 0, fmt.Errorf("missing required key fields") + } + + return chunk.Key{ + OriginID: originID, + Bucket: bucket, + ObjectKey: key, + ETag: etag, + ChunkSize: chunkSize, + Index: idx, + }, objectSize, nil +} diff --git a/internal/orca/cluster/cluster_test.go b/internal/orca/cluster/cluster_test.go new file mode 100644 index 00000000..431e848a --- /dev/null +++ b/internal/orca/cluster/cluster_test.go @@ -0,0 +1,763 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package cluster + +import ( + "bytes" + "context" + "errors" + "io" + "log/slog" + "net" + "net/http" + "net/url" + "strconv" + "strings" + "sync/atomic" + "testing" + "time" + + "github.com/Azure/unbounded/internal/orca/chunk" + "github.com/Azure/unbounded/internal/orca/config" +) + +// fakePeerSource implements PeerSource for unit tests. +type fakePeerSource struct { + mu func() ([]Peer, error) + calls atomic.Int64 +} + +func (f *fakePeerSource) Peers(_ context.Context) ([]Peer, error) { + f.calls.Add(1) + + return f.mu() +} + +// TestRefresh_RetainsPreviousOnError verifies that a discovery error +// after a successful refresh retains the previous peer-set rather +// than clobbering it with [Self]. +// +// Regression test for B3. +func TestRefresh_RetainsPreviousOnError(t *testing.T) { + t.Parallel() + + good := []Peer{ + {IP: "10.0.0.1", Self: false}, + {IP: "10.0.0.2", Self: true}, + {IP: "10.0.0.3", Self: false}, + } + + var failing atomic.Bool + + src := &fakePeerSource{ + mu: func() ([]Peer, error) { + if failing.Load() { + return nil, errors.New("transient DNS failure") + } + + out := make([]Peer, len(good)) + copy(out, good) + + return out, nil + }, + } + + c, err := New(t.Context(), + config.Cluster{ + Service: "test", + SelfPodIP: "10.0.0.2", + MembershipRefresh: time.Hour, // disable auto-refresh; we drive it manually + }, + WithPeerSource(src), + ) + if err != nil { + t.Fatalf("New: %v", err) + } + + t.Cleanup(func() { _ = c.Close(context.Background()) }) + + // Initial refresh ran during New; verify good peers are loaded. + if got := len(c.Peers()); got != 3 { + t.Fatalf("initial Peers()=%d want 3", got) + } + + failing.Store(true) + // First few error refreshes: retain previous snapshot. + for i := 0; i < maxStalePeerRefreshes; i++ { + c.refresh(t.Context()) + + if got := len(c.Peers()); got != 3 { + t.Errorf("after error %d: Peers()=%d want 3 (retain previous)", i+1, got) + } + } + // Next refresh exceeds the staleness ceiling -> fall back to self. + c.refresh(t.Context()) + + if got := c.Peers(); len(got) != 1 || !got[0].Self { + t.Errorf("after ceiling exceeded: Peers()=%+v want [Self]", got) + } + // Recovery: source returns good peers again. Error counter resets. + failing.Store(false) + c.refresh(t.Context()) + + if got := len(c.Peers()); got != 3 { + t.Errorf("after recovery: Peers()=%d want 3", got) + } + + if got := c.consecutiveRefreshErrors.Load(); got != 0 { + t.Errorf("error counter not reset after success: got %d", got) + } +} + +// TestRefresh_BootstrapErrorFallsBackToSelf verifies that on bootstrap +// (no previous snapshot) a discovery error falls back to [Self] +// immediately - we cannot retain something that does not exist. +func TestRefresh_BootstrapErrorFallsBackToSelf(t *testing.T) { + t.Parallel() + + src := &fakePeerSource{ + mu: func() ([]Peer, error) { + return nil, errors.New("DNS not reachable yet") + }, + } + + c, err := New(t.Context(), + config.Cluster{ + Service: "test", + SelfPodIP: "10.0.0.1", + MembershipRefresh: time.Hour, + }, + WithPeerSource(src), + ) + if err != nil { + t.Fatalf("New: %v", err) + } + + t.Cleanup(func() { _ = c.Close(context.Background()) }) + + got := c.Peers() + if len(got) != 1 || !got[0].Self { + t.Errorf("bootstrap with error source: Peers()=%+v want [Self]", got) + } +} + +// TestRefresh_EmptyResultFallsBackToSelf verifies that a successful +// discovery returning zero peers (the legitimate "I'm alone" answer) +// still falls back to [Self] without bumping the error counter. +func TestRefresh_EmptyResultFallsBackToSelf(t *testing.T) { + t.Parallel() + + src := &fakePeerSource{ + mu: func() ([]Peer, error) { + return nil, nil // no error, zero peers + }, + } + + c, err := New(t.Context(), + config.Cluster{ + Service: "test", + SelfPodIP: "10.0.0.1", + MembershipRefresh: time.Hour, + }, + WithPeerSource(src), + ) + if err != nil { + t.Fatalf("New: %v", err) + } + + t.Cleanup(func() { _ = c.Close(context.Background()) }) + + got := c.Peers() + if len(got) != 1 || !got[0].Self { + t.Errorf("empty source: Peers()=%+v want [Self]", got) + } + + if got := c.consecutiveRefreshErrors.Load(); got != 0 { + t.Errorf("empty (non-error) result should not bump error counter; got %d", got) + } +} + +// TestFillFromPeer_DetectsTruncation verifies that the validating +// reader returned by FillFromPeer surfaces io.ErrUnexpectedEOF when +// the peer advertises a Content-Length but the connection closes +// before that many bytes have been delivered. Without the validator +// the requester would observe a clean io.EOF and silently pass +// short bytes through to the client. +// +// Regression test for B7. +func TestFillFromPeer_DetectsTruncation(t *testing.T) { + t.Parallel() + + const advertised = 100 + + const delivered = 50 + + // Use a raw TCP listener so we have full control over the wire + // format: write Content-Length: 100, then write 50 body bytes, + // then close the connection mid-stream. + ln, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + t.Fatalf("listen: %v", err) + } + + t.Cleanup(func() { _ = ln.Close() }) //nolint:errcheck // test cleanup + + go func() { + conn, err := ln.Accept() + if err != nil { + return + } + + defer conn.Close() //nolint:errcheck // test cleanup + // Consume request headers up through the blank line. + buf := make([]byte, 4096) + + if _, err := conn.Read(buf); err != nil { + return + } + + resp := "HTTP/1.1 200 OK\r\n" + + "Content-Length: " + strconv.Itoa(advertised) + "\r\n" + + "Content-Type: application/octet-stream\r\n" + + "\r\n" + if _, err := conn.Write([]byte(resp)); err != nil { + return + } + + if _, err := conn.Write(make([]byte, delivered)); err != nil { + return + } + // Close mid-body without writing the remaining bytes. + }() + + host, portStr, err := net.SplitHostPort(ln.Addr().String()) + if err != nil { + t.Fatalf("split host port: %v", err) + } + + port, err := strconv.Atoi(portStr) + if err != nil { + t.Fatalf("parse port: %v", err) + } + + c, err := New(t.Context(), + config.Cluster{ + Service: "test", + SelfPodIP: "10.0.0.1", + MembershipRefresh: time.Hour, + InternalListen: "0.0.0.0:8444", + }, + WithPeerSource(&fakePeerSource{mu: func() ([]Peer, error) { + return []Peer{{IP: "10.0.0.1", Self: true}}, nil + }}), + ) + if err != nil { + t.Fatalf("New: %v", err) + } + + t.Cleanup(func() { _ = c.Close(context.Background()) }) + + peer := Peer{IP: host, Port: port} + key := chunk.Key{ + OriginID: "test-origin", + Bucket: "test-bucket", + ObjectKey: "test-object", + ETag: "test-etag", + ChunkSize: advertised, + Index: 0, + } + + body, err := c.FillFromPeer(t.Context(), peer, key, advertised) + if err != nil { + t.Fatalf("FillFromPeer: %v", err) + } + + defer body.Close() //nolint:errcheck // test cleanup + + got, err := io.ReadAll(body) + if !errors.Is(err, io.ErrUnexpectedEOF) { + t.Errorf("expected io.ErrUnexpectedEOF, got err=%v (read %d bytes)", err, len(got)) + } + + if len(got) != delivered { + t.Errorf("got %d bytes, expected %d (the delivered prefix)", len(got), delivered) + } +} + +// TestNewHTTPClient_NoWallTimeout asserts that the default +// internal-RPC HTTP client carries no Client.Timeout. Client.Timeout +// is a request-total wall clock that would clamp long-running fill +// body streams (an 8 MiB chunk on a degraded inter-pod link can +// exceed any reasonable hardcoded bound). The caller's ctx is the +// sole deadline for body reads. +func TestNewHTTPClient_NoWallTimeout(t *testing.T) { + t.Parallel() + + c, err := newHTTPClient(config.Cluster{}) + if err != nil { + t.Fatalf("newHTTPClient: %v", err) + } + + if c.Timeout != 0 { + t.Errorf("internal-RPC http.Client.Timeout = %v, want 0", c.Timeout) + } +} + +// TestNewHTTPClient_ConnectTimeouts asserts that the Transport +// carries bounded connect-level timeouts independent of the +// caller's ctx. Without these, a stuck TCP SYN or stalled TLS +// handshake against a half-failed peer would hang until the +// caller's deadline (which is the full 5-minute fill ctx for +// leader-side fills, causing slot starvation). +// +// Regression for H-4. +func TestNewHTTPClient_ConnectTimeouts(t *testing.T) { + t.Parallel() + + c, err := newHTTPClient(config.Cluster{}) + if err != nil { + t.Fatalf("newHTTPClient: %v", err) + } + + tr, ok := c.Transport.(*http.Transport) + if !ok { + t.Fatalf("Transport is %T; want *http.Transport", c.Transport) + } + + if tr.TLSHandshakeTimeout == 0 { + t.Errorf("TLSHandshakeTimeout is 0; want bounded") + } + + if tr.DialContext == nil { + t.Errorf("DialContext is nil; expected bounded dialer") + } +} + +// TestNewHTTPClient_InternalTLSEnabledRefusesToStart verifies that +// newHTTPClient refuses to construct a client when +// cfg.InternalTLS.Enabled=true. The TLS configuration is not yet +// wired into the transport (no TLSClientConfig); returning a working +// client in that case would silently dial https:// against the +// system trust store instead of the configured CA, downgrading the +// security posture. The constructor must fail loudly until the +// production TLS wiring is implemented. +func TestNewHTTPClient_InternalTLSEnabledRefusesToStart(t *testing.T) { + t.Parallel() + + cfg := config.Cluster{ + InternalTLS: config.InternalTLS{Enabled: true}, + } + + c, err := newHTTPClient(cfg) + if err == nil { + t.Fatalf("newHTTPClient with InternalTLS.Enabled=true returned client %v; want error", c) + } +} + +// TestFillFromPeer_CtxDeadlineHonored verifies that the caller's ctx +// deadline (rather than any hardcoded wall clock inside the cluster's +// HTTP client) is what bounds the cross-replica fill. Sets up a +// slow-paced TCP server that delivers a full Content-Length body +// over ~250ms, and calls FillFromPeer with a 50ms ctx; expects the +// read to fail with context.DeadlineExceeded. +// +// Companion to the wall-timeout removal: regression-tests that ctx +// propagation still bounds the request even though the +// Client.Timeout safety net is gone. +func TestFillFromPeer_CtxDeadlineHonored(t *testing.T) { + t.Parallel() + + const advertised = 1024 + + ln, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + t.Fatalf("listen: %v", err) + } + + t.Cleanup(func() { _ = ln.Close() }) //nolint:errcheck // test cleanup + + go func() { + conn, err := ln.Accept() + if err != nil { + return + } + + defer conn.Close() //nolint:errcheck // test cleanup + + buf := make([]byte, 4096) + if _, err := conn.Read(buf); err != nil { + return + } + + resp := "HTTP/1.1 200 OK\r\n" + + "Content-Length: " + strconv.Itoa(advertised) + "\r\n" + + "Content-Type: application/octet-stream\r\n" + + "\r\n" + if _, err := conn.Write([]byte(resp)); err != nil { + return + } + // Drip body bytes slowly: 64 bytes every 20ms (~ 320ms for + // the full 1 KiB), far exceeding the 50ms ctx deadline. + body := make([]byte, advertised) + + for i := 0; i < advertised; i += 64 { + end := i + 64 + if end > advertised { + end = advertised + } + + if _, err := conn.Write(body[i:end]); err != nil { + return + } + + time.Sleep(20 * time.Millisecond) + } + }() + + host, portStr, err := net.SplitHostPort(ln.Addr().String()) + if err != nil { + t.Fatalf("split host port: %v", err) + } + + port, err := strconv.Atoi(portStr) + if err != nil { + t.Fatalf("parse port: %v", err) + } + + c, err := New(t.Context(), + config.Cluster{ + Service: "test", + SelfPodIP: "10.0.0.1", + MembershipRefresh: time.Hour, + InternalListen: "0.0.0.0:8444", + }, + WithPeerSource(&fakePeerSource{mu: func() ([]Peer, error) { + return []Peer{{IP: "10.0.0.1", Self: true}}, nil + }}), + ) + if err != nil { + t.Fatalf("New: %v", err) + } + + t.Cleanup(func() { _ = c.Close(context.Background()) }) + + peer := Peer{IP: host, Port: port} + key := chunk.Key{ + OriginID: "test-origin", + Bucket: "test-bucket", + ObjectKey: "test-object", + ETag: "test-etag", + ChunkSize: advertised, + Index: 0, + } + + ctx, cancel := context.WithTimeout(t.Context(), 50*time.Millisecond) + defer cancel() + + body, err := c.FillFromPeer(ctx, peer, key, advertised) + if err != nil { + if !errors.Is(err, context.DeadlineExceeded) { + t.Fatalf("FillFromPeer err = %v, want context.DeadlineExceeded (or success then deadline on read)", err) + } + + return + } + + defer body.Close() //nolint:errcheck // test cleanup + + _, readErr := io.ReadAll(body) + if !errors.Is(readErr, context.DeadlineExceeded) { + t.Errorf("ReadAll err = %v, want context.DeadlineExceeded", readErr) + } +} + +// TestWithHTTPClient_Overrides verifies the test seam: tests can +// inject an alternate http.Client (used to give a deterministic +// short timeout or custom transport behaviour). +func TestWithHTTPClient_Overrides(t *testing.T) { + t.Parallel() + + custom := &http.Client{Timeout: 42 * time.Millisecond} + + c, err := New(t.Context(), + config.Cluster{ + Service: "test", + SelfPodIP: "10.0.0.1", + MembershipRefresh: time.Hour, + }, + WithPeerSource(&fakePeerSource{mu: func() ([]Peer, error) { + return []Peer{{IP: "10.0.0.1", Self: true}}, nil + }}), + WithHTTPClient(custom), + ) + if err != nil { + t.Fatalf("New: %v", err) + } + + t.Cleanup(func() { _ = c.Close(context.Background()) }) + + if c.httpClient != custom { + t.Errorf("httpClient not overridden by WithHTTPClient") + } +} + +// TestWithLogger_OverridesDefault verifies the cluster honours the +// injected slog.Logger so cluster.refresh's warn-level +// retain-snapshot message and the debug-level emissions route to +// the caller's configured handler rather than slog.Default. +func TestWithLogger_OverridesDefault(t *testing.T) { + t.Parallel() + + injected := slog.New(slog.NewTextHandler(io.Discard, nil)) + + c, err := New(t.Context(), + config.Cluster{ + Service: "test", + SelfPodIP: "10.0.0.1", + MembershipRefresh: time.Hour, + }, + WithPeerSource(&fakePeerSource{mu: func() ([]Peer, error) { + return []Peer{{IP: "10.0.0.1", Self: true}}, nil + }}), + WithLogger(injected), + ) + if err != nil { + t.Fatalf("New: %v", err) + } + + t.Cleanup(func() { _ = c.Close(context.Background()) }) + + if c.log != injected { + t.Errorf("Cluster.log not the injected logger") + } +} + +// TestRefresh_EmitsMembershipTransition verifies that a peer-set +// change (member added) surfaces a Info-level 'peer_set_changed' +// log line. Stable refreshes (no delta) must not re-emit this line. +func TestRefresh_EmitsMembershipTransition(t *testing.T) { + t.Parallel() + + initial := []Peer{ + {IP: "10.0.0.2", Self: true}, + } + + current := initial + + src := &fakePeerSource{ + mu: func() ([]Peer, error) { + out := make([]Peer, len(current)) + copy(out, current) + + return out, nil + }, + } + + var buf bytes.Buffer + + log := slog.New(slog.NewTextHandler(&buf, &slog.HandlerOptions{Level: slog.LevelDebug})) + + c, err := New(t.Context(), + config.Cluster{ + Service: "test", + SelfPodIP: "10.0.0.2", + MembershipRefresh: time.Hour, + }, + WithPeerSource(src), + WithLogger(log), + ) + if err != nil { + t.Fatalf("New: %v", err) + } + + t.Cleanup(func() { _ = c.Close(context.Background()) }) + + // Initial snapshot landed during New: peer_set_initial emitted. + if !strings.Contains(buf.String(), "peer_set_initial") { + t.Errorf("expected peer_set_initial on bootstrap; got %q", buf.String()) + } + + buf.Reset() + + // Stable refresh: no delta -> only the debug peer_set_refreshed. + c.refresh(t.Context()) + + if strings.Contains(buf.String(), "peer_set_changed") { + t.Errorf("peer_set_changed should not fire when peer-set is stable; got %q", buf.String()) + } + + if !strings.Contains(buf.String(), "peer_set_refreshed") { + t.Errorf("expected per-cycle peer_set_refreshed; got %q", buf.String()) + } + + buf.Reset() + + // Add a peer: peer_set_changed must fire with the 'added' key. + current = append([]Peer{}, initial...) + current = append(current, Peer{IP: "10.0.0.3"}) + + c.refresh(t.Context()) + + if !strings.Contains(buf.String(), "peer_set_changed") { + t.Errorf("peer_set_changed missing on add; got %q", buf.String()) + } + + if !strings.Contains(buf.String(), "10.0.0.3") { + t.Errorf("added peer IP missing from log; got %q", buf.String()) + } +} + +// TestCoordinator_EmitsDebugSelection verifies the per-call debug +// emission carrying the chosen-peer and rendezvous score for a +// chunk. Operators rely on this to diagnose routing surprises. +func TestCoordinator_EmitsDebugSelection(t *testing.T) { + t.Parallel() + + var buf bytes.Buffer + + log := slog.New(slog.NewTextHandler(&buf, &slog.HandlerOptions{Level: slog.LevelDebug})) + + c, err := New(t.Context(), + config.Cluster{ + Service: "test", + SelfPodIP: "10.0.0.1", + MembershipRefresh: time.Hour, + }, + WithPeerSource(&fakePeerSource{mu: func() ([]Peer, error) { + return []Peer{{IP: "10.0.0.1", Self: true}}, nil + }}), + WithLogger(log), + ) + if err != nil { + t.Fatalf("New: %v", err) + } + + t.Cleanup(func() { _ = c.Close(context.Background()) }) + + buf.Reset() + + c.Coordinator(chunk.Key{ + OriginID: "ox", Bucket: "b", ObjectKey: "o", ChunkSize: 1024, Index: 5, + }) + + out := buf.String() + for _, want := range []string{"coordinator_selected", "chosen_ip=10.0.0.1", "is_self=true", "index=5"} { + if !strings.Contains(out, want) { + t.Errorf("expected %q in coord debug output; got %q", want, out) + } + } +} + +// TestRefresh_CtxCanceledDoesNotBumpErrorCounter verifies that a +// refresh call whose ctx has been cancelled (the normal shutdown +// path) does not bump consecutiveRefreshErrors or churn the stored +// peer-set into the self-only fallback. Without this guard the +// final refresh during graceful shutdown produces a 'discovery +// failed' warning and pushes the membership into the self-only +// path even though nothing has actually gone wrong. +func TestRefresh_CtxCanceledDoesNotBumpErrorCounter(t *testing.T) { + t.Parallel() + + good := []Peer{ + {IP: "10.0.0.1", Self: false}, + {IP: "10.0.0.2", Self: true}, + } + + var failWithCancel atomic.Bool + + src := &fakePeerSource{ + mu: func() ([]Peer, error) { + if failWithCancel.Load() { + return nil, context.Canceled + } + + out := make([]Peer, len(good)) + copy(out, good) + + return out, nil + }, + } + + c, err := New(t.Context(), + config.Cluster{ + Service: "test", + SelfPodIP: "10.0.0.2", + MembershipRefresh: time.Hour, // disable auto-refresh; drive manually. + }, + WithPeerSource(src), + ) + if err != nil { + t.Fatalf("New: %v", err) + } + + t.Cleanup(func() { _ = c.Close(context.Background()) }) + + if got := c.consecutiveRefreshErrors.Load(); got != 0 { + t.Fatalf("pre-test error counter = %d, want 0", got) + } + + initialPeers := len(c.Peers()) + + failWithCancel.Store(true) + c.refresh(t.Context()) + + if got := c.consecutiveRefreshErrors.Load(); got != 0 { + t.Errorf("counter bumped on ctx.Canceled; got %d want 0", got) + } + + if got := len(c.Peers()); got != initialPeers { + t.Errorf("peer-set churned on ctx.Canceled; got %d want %d", got, initialPeers) + } +} + +// TestDecodeChunkKey_RejectsZeroObjectSize verifies that the wire +// boundary rejects object_size == 0 as well as negative values. +// The previous code accepted 0 as a sentinel for "unknown size" +// which became a foot-gun (validation skipped, malformed range, +// validating-reader bypassed); production callers always know the +// size from a prior Head, so tightening the contract removes the +// foot-gun without breaking any real caller. +// +// Regression for C-2 / C-3 / C-4. +func TestDecodeChunkKey_RejectsZeroObjectSize(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + objectSize string + wantErr bool + }{ + {"zero rejected", "0", true}, + {"negative rejected", "-1", true}, + {"positive accepted", "1024", false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + v := url.Values{} + v.Set("origin_id", "ox") + v.Set("bucket", "b") + v.Set("key", "o") + v.Set("etag", "e1") + v.Set("chunk_size", "1024") + v.Set("index", "0") + v.Set("object_size", tt.objectSize) + + _, _, err := DecodeChunkKey(v) + if tt.wantErr { + if err == nil { + t.Errorf("DecodeChunkKey(object_size=%s) returned nil; want error", tt.objectSize) + } else if !strings.Contains(err.Error(), "object_size") { + t.Errorf("error does not mention object_size: %v", err) + } + + return + } + + if err != nil { + t.Errorf("DecodeChunkKey(object_size=%s) unexpected error: %v", tt.objectSize, err) + } + }) + } +} diff --git a/internal/orca/config/config.go b/internal/orca/config/config.go new file mode 100644 index 00000000..dd86e855 --- /dev/null +++ b/internal/orca/config/config.go @@ -0,0 +1,544 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package config defines Orca's YAML configuration shape and loading +// helpers. +// +// The schema is an intentional subset of the full Orca configuration +// surface; extending it later is a matter of adding fields and keeping +// zero-values backward-compatible. +package config + +import ( + "fmt" + "log/slog" + "os" + "strings" + "time" + + "gopkg.in/yaml.v3" + + "github.com/Azure/unbounded/internal/orca/chunk" +) + +// Config is the top-level Orca configuration. +type Config struct { + Server Server `yaml:"server"` + Origin Origin `yaml:"origin"` + Cachestore Cachestore `yaml:"cachestore"` + Cluster Cluster `yaml:"cluster"` + ChunkCatalog ChunkCatalog `yaml:"chunk_catalog"` + Metadata Metadata `yaml:"metadata"` + Chunking Chunking `yaml:"chunking"` + Logging Logging `yaml:"logging"` +} + +// Logging governs structured-log output. The level controls slog +// emission filtering; debug surfaces per-request and per-chunk +// tracing through the fetch coordinator, metadata cache, chunk +// catalog, cluster, cachestore, and origin drivers. +// +// The ORCA_LOG_LEVEL environment variable, if set and non-empty, +// overrides the YAML-configured Level at process startup. Useful +// for one-shot debug sessions without re-rendering the configmap. +type Logging struct { + // Level is one of "debug", "info", "warn", "error". Empty + // defaults to "info". + Level string `yaml:"level"` +} + +// Server holds the client-edge listener configuration plus the +// ops listener used for kubelet probes (/healthz and /readyz). +type Server struct { + Listen string `yaml:"listen"` + Auth ServerAuth `yaml:"auth"` + + // OpsListen is the bind address for the operations endpoint + // hosting /healthz and /readyz. Plain HTTP, no auth. Kubelet + // liveness and readiness probes target this address; production + // Service objects do not forward this port externally. + OpsListen string `yaml:"ops_listen"` +} + +// ServerAuth governs the client-edge authentication path. +// +// Production: enabled=true with mode=bearer or mode=mtls. +// Dev: enabled=false disables authentication entirely (no token +// or client cert required). This is a single security knob, not a +// dev_mode flag. +type ServerAuth struct { + Enabled bool `yaml:"enabled"` + Mode string `yaml:"mode"` + BearerSecretFile string `yaml:"bearer_secret_file"` +} + +// Origin describes the upstream origin (Azure Blob or AWS S3 in v1). +type Origin struct { + ID string `yaml:"id"` + Driver string `yaml:"driver"` // "azureblob" or "awss3" + TargetGlobal int `yaml:"target_global"` + QueueTimeout time.Duration `yaml:"queue_timeout"` + Retry OriginRetry `yaml:"retry"` + Azureblob Azureblob `yaml:"azureblob"` + AWSS3 AWSS3 `yaml:"awss3"` +} + +// OriginRetry captures the leader-side pre-header retry budget. +type OriginRetry struct { + Attempts int `yaml:"attempts"` + BackoffInitial time.Duration `yaml:"backoff_initial"` + BackoffMax time.Duration `yaml:"backoff_max"` + MaxTotalDuration time.Duration `yaml:"max_total_duration"` +} + +// Azureblob is the azureblob origin adapter configuration. +// +// Page and Append blobs are unconditionally rejected at Head: their +// random-access mutation model is incompatible with the chunked, +// immutable cache contract orca relies on. There is no configuration +// switch for this behaviour. +type Azureblob struct { + Account string `yaml:"account"` + AccountKey string `yaml:"account_key"` + Container string `yaml:"container"` + + // Endpoint, when set, overrides the default Azure Blob service URL + // (https://.blob.core.windows.net/). Used in dev to point + // at Azurite (http://azurite:10000/devstoreaccount1) so the + // azureblob driver path can be exercised without a real Azure + // account. + Endpoint string `yaml:"endpoint"` +} + +// AWSS3 is the awss3 origin adapter configuration. In dev this points +// at LocalStack alongside the cachestore (different bucket); in +// production it points at real AWS S3 with no Endpoint override. +type AWSS3 struct { + Endpoint string `yaml:"endpoint"` // empty for real AWS S3 + Region string `yaml:"region"` + Bucket string `yaml:"bucket"` + AccessKey string `yaml:"access_key"` + SecretKey string `yaml:"secret_key"` + UsePathStyle bool `yaml:"use_path_style"` // true for LocalStack +} + +// Cachestore is the in-DC chunk store configuration. +type Cachestore struct { + Driver string `yaml:"driver"` // "s3" in v1 + S3 CachestoreS3 `yaml:"s3"` +} + +// CachestoreS3 is the s3 driver configuration. In dev this points at +// LocalStack; in production at VAST or another in-DC S3-compatible +// store. +// +// Bucket versioning is unconditionally validated at startup: a +// versioned bucket silently breaks the no-clobber atomic-commit +// primitive (PutObject + If-None-Match: *) the driver depends on. +// There is no configuration switch for this gate. +type CachestoreS3 struct { + Endpoint string `yaml:"endpoint"` + Bucket string `yaml:"bucket"` + Region string `yaml:"region"` + AccessKey string `yaml:"access_key"` + SecretKey string `yaml:"secret_key"` + UsePathStyle bool `yaml:"use_path_style"` // true for LocalStack +} + +// Cluster captures peer discovery + internal-listener configuration. +type Cluster struct { + Service string `yaml:"service"` // headless Service FQDN + MembershipRefresh time.Duration `yaml:"membership_refresh"` // DNS poll interval + InternalListen string `yaml:"internal_listen"` + InternalTLS InternalTLS `yaml:"internal_tls"` + TargetReplicas int `yaml:"target_replicas"` + SelfPodIP string `yaml:"self_pod_ip"` // resolved from POD_IP env +} + +// InternalTLS governs the internal-listener mTLS posture. +// +// Production: enabled=true (mTLS required). +// Dev: enabled=false (plain HTTP/2). The binary logs WARN at startup. +type InternalTLS struct { + Enabled bool `yaml:"enabled"` + CertFile string `yaml:"cert_file"` + KeyFile string `yaml:"key_file"` + CAFile string `yaml:"ca_file"` + ServerName string `yaml:"server_name"` +} + +// ChunkCatalog is the in-memory chunk-presence cache configuration. +type ChunkCatalog struct { + MaxEntries int `yaml:"max_entries"` +} + +// Metadata is the object-metadata cache configuration. +type Metadata struct { + TTL time.Duration `yaml:"ttl"` + NegativeTTL time.Duration `yaml:"negative_ttl"` + MaxEntries int `yaml:"max_entries"` +} + +// Chunking governs chunk size and read-ahead for client GETs. +// +// Size is the base chunk size used for objects smaller than the +// smallest Tier threshold. Tiers, if non-empty, override Size for +// objects at or above each tier's MinObjectSize: the tier with the +// largest threshold <= the object's size wins. Tiers must be +// strictly ascending by MinObjectSize; the loader enforces this +// at validate time so the runtime selection path can assume sorted +// input. +// +// Readahead is the number of chunks the client-edge GET handler +// prefetches while streaming the current chunk to the client. It +// is a pointer so the loader can distinguish an omitted YAML field +// (defaults to 8) from an explicit "readahead: 0" (disables +// read-ahead and restores the strictly-sequential chunk-fetch +// behavior). The cost is bounded by readahead * effective_chunk_size +// of extra in-flight cachestore body buffers per concurrent GET; +// cold-fill speculation is additionally bounded by the per-replica +// origin semaphore (target_per_replica), so peak per-replica +// cold-buffer memory is at most: +// +// target_per_replica * max(Size, max ChunkSize across Tiers) +// +// With the defaults (Size=8 MiB, Tiers up to 128 MiB, 4 replicas at +// target_global=64), the per-replica ceiling is 16 * 128 MiB = 2 GiB. +// Operators with tighter memory budgets should lower the highest +// tier's ChunkSize or drop the largest-object tier entirely. +type Chunking struct { + Size int64 `yaml:"size"` // bytes per chunk; default 8 MiB + Tiers []ChunkTier `yaml:"tiers"` + Readahead *int `yaml:"readahead"` +} + +// ChunkTier is one entry in the Chunking.Tiers ladder. Objects whose +// size is at or above MinObjectSize use ChunkSize, unless a +// higher-threshold tier also matches (in which case the higher tier +// wins). Both fields must be > 0; ChunkSize must be >= 1 MiB (the +// floor that applies to Chunking.Size as well). +type ChunkTier struct { + MinObjectSize int64 `yaml:"min_object_size"` + ChunkSize int64 `yaml:"chunk_size"` +} + +// AsChunkTiers returns the configured tier ladder as a []chunk.Tier +// slice suitable for chunk.SizeFor. Returns nil for an empty list. +// The slice is in the validated ascending-MinObjectSize order. +func (c Chunking) AsChunkTiers() []chunk.Tier { + if len(c.Tiers) == 0 { + return nil + } + + out := make([]chunk.Tier, len(c.Tiers)) + for i, t := range c.Tiers { + out[i] = chunk.Tier{MinObjectSize: t.MinObjectSize, ChunkSize: t.ChunkSize} + } + + return out +} + +// ReadaheadDepth returns the configured read-ahead depth. A nil +// pointer (YAML omitted) returns 0; applyDefaults populates the +// default-on value so configurations that loaded through Load +// always have a non-nil pointer. Callers that bypass Load (e.g. +// hand-constructed test configs) get 0 for nil, which matches the +// "feature disabled" semantics. +func (c Chunking) ReadaheadDepth() int { + if c.Readahead == nil { + return 0 + } + + return *c.Readahead +} + +// Load reads the YAML config file at path and returns a populated +// Config. Defaults are applied for fields left at zero-value. +func Load(path string) (*Config, error) { + raw, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("read %s: %w", path, err) + } + + cfg := &Config{} + if err := yaml.Unmarshal(raw, cfg); err != nil { + return nil, fmt.Errorf("yaml unmarshal: %w", err) + } + + cfg.applyDefaults() + + if err := cfg.validate(); err != nil { + return nil, fmt.Errorf("config invalid: %w", err) + } + + return cfg, nil +} + +func (c *Config) applyDefaults() { + // Server. + if c.Server.Listen == "" { + c.Server.Listen = "0.0.0.0:8443" + } + + if c.Server.OpsListen == "" { + c.Server.OpsListen = "0.0.0.0:8442" + } + // Origin. + if c.Origin.Driver == "" { + c.Origin.Driver = "azureblob" + } + + if c.Origin.TargetGlobal == 0 { + c.Origin.TargetGlobal = 192 + } + + if c.Origin.QueueTimeout == 0 { + c.Origin.QueueTimeout = 5 * time.Second + } + + if c.Origin.Retry.Attempts == 0 { + c.Origin.Retry.Attempts = 3 + } + + if c.Origin.Retry.BackoffInitial == 0 { + c.Origin.Retry.BackoffInitial = 100 * time.Millisecond + } + + if c.Origin.Retry.BackoffMax == 0 { + c.Origin.Retry.BackoffMax = 2 * time.Second + } + + if c.Origin.Retry.MaxTotalDuration == 0 { + c.Origin.Retry.MaxTotalDuration = 5 * time.Second + } + // Cachestore. + if c.Cachestore.Driver == "" { + c.Cachestore.Driver = "s3" + } + + if c.Cachestore.S3.Region == "" { + c.Cachestore.S3.Region = "us-east-1" + } + // Cluster. + if c.Cluster.MembershipRefresh == 0 { + c.Cluster.MembershipRefresh = 5 * time.Second + } + + if c.Cluster.InternalListen == "" { + c.Cluster.InternalListen = "0.0.0.0:8444" + } + + if c.Cluster.TargetReplicas == 0 { + c.Cluster.TargetReplicas = 3 + } + + if c.Cluster.InternalTLS.ServerName == "" { + c.Cluster.InternalTLS.ServerName = "orca..svc" + } + // Resolve self pod IP from env if not set in YAML. + if c.Cluster.SelfPodIP == "" { + c.Cluster.SelfPodIP = os.Getenv("POD_IP") + } + // Resolve credentials from env if not set in YAML. This lets the + // non-secret config live in a ConfigMap while credentials come from + // a Kubernetes Secret mounted as env vars (envFrom: secretRef). + if c.Origin.Azureblob.AccountKey == "" { + c.Origin.Azureblob.AccountKey = os.Getenv("ORCA_AZUREBLOB_ACCOUNT_KEY") + } + + if c.Origin.AWSS3.AccessKey == "" { + c.Origin.AWSS3.AccessKey = os.Getenv("ORCA_AWSS3_ACCESS_KEY") + } + + if c.Origin.AWSS3.SecretKey == "" { + c.Origin.AWSS3.SecretKey = os.Getenv("ORCA_AWSS3_SECRET_KEY") + } + + if c.Cachestore.S3.AccessKey == "" { + c.Cachestore.S3.AccessKey = os.Getenv("ORCA_CACHESTORE_S3_ACCESS_KEY") + } + + if c.Cachestore.S3.SecretKey == "" { + c.Cachestore.S3.SecretKey = os.Getenv("ORCA_CACHESTORE_S3_SECRET_KEY") + } + // awss3 region default. + if c.Origin.AWSS3.Region == "" { + c.Origin.AWSS3.Region = "us-east-1" + } + // Chunk catalog. + if c.ChunkCatalog.MaxEntries == 0 { + c.ChunkCatalog.MaxEntries = 100_000 + } + // Metadata. + if c.Metadata.TTL == 0 { + c.Metadata.TTL = 5 * time.Minute + } + + if c.Metadata.NegativeTTL == 0 { + c.Metadata.NegativeTTL = 60 * time.Second + } + + if c.Metadata.MaxEntries == 0 { + c.Metadata.MaxEntries = 10_000 + } + // Chunking. + if c.Chunking.Size == 0 { + c.Chunking.Size = 8 * 1024 * 1024 + } + // Tier ladder: default to a two-tier ramp that keeps small + // objects on the 8 MiB base size, bumps 1 GiB+ blobs to 64 MiB, + // and 10 GiB+ blobs to 128 MiB. Operators can replace or + // disable the ladder by setting tiers explicitly (including the + // empty list) in YAML. + if c.Chunking.Tiers == nil { + c.Chunking.Tiers = []ChunkTier{ + {MinObjectSize: 1024 * 1024 * 1024, ChunkSize: 64 * 1024 * 1024}, + {MinObjectSize: 10 * 1024 * 1024 * 1024, ChunkSize: 128 * 1024 * 1024}, + } + } + // Readahead defaults to 8 chunks when the YAML field is omitted. + // An explicit "readahead: 0" disables prefetch. + if c.Chunking.Readahead == nil { + d := 8 + c.Chunking.Readahead = &d + } + // Logging. + if c.Logging.Level == "" { + c.Logging.Level = "info" + } +} + +func (c *Config) validate() error { + if c.Origin.ID == "" { + return fmt.Errorf("origin.id is required") + } + + switch c.Origin.Driver { + case "azureblob": + if c.Origin.Azureblob.Account == "" { + return fmt.Errorf("origin.azureblob.account is required") + } + + if c.Origin.Azureblob.Container == "" { + return fmt.Errorf("origin.azureblob.container is required") + } + case "awss3": + if c.Origin.AWSS3.Bucket == "" { + return fmt.Errorf("origin.awss3.bucket is required") + } + default: + return fmt.Errorf("origin.driver %q unsupported; supported: azureblob, awss3", + c.Origin.Driver) + } + + if c.Cachestore.Driver != "s3" { + return fmt.Errorf("cachestore.driver %q unsupported; only s3 in v1", c.Cachestore.Driver) + } + + if c.Cachestore.S3.Endpoint == "" { + return fmt.Errorf("cachestore.s3.endpoint is required") + } + + if c.Cachestore.S3.Bucket == "" { + return fmt.Errorf("cachestore.s3.bucket is required") + } + + if c.Cluster.Service == "" { + return fmt.Errorf("cluster.service is required (headless Service FQDN)") + } + + if c.Cluster.SelfPodIP == "" { + return fmt.Errorf("cluster.self_pod_ip is required (typically resolved from POD_IP env)") + } + + if c.Cluster.TargetReplicas < 1 { + return fmt.Errorf("cluster.target_replicas must be >= 1") + } + + if c.Origin.TargetGlobal < c.Cluster.TargetReplicas { + return fmt.Errorf( + "origin.target_global=%d must be >= cluster.target_replicas=%d", + c.Origin.TargetGlobal, c.Cluster.TargetReplicas, + ) + } + + if c.Chunking.Size < 1024*1024 { + return fmt.Errorf("chunking.size %d too small; minimum 1 MiB", c.Chunking.Size) + } + + if err := validateChunkingTiers(c.Chunking.Tiers); err != nil { + return err + } + + if c.Chunking.Readahead != nil && *c.Chunking.Readahead < 0 { + return fmt.Errorf("chunking.readahead %d invalid; must be >= 0", *c.Chunking.Readahead) + } + + if _, err := ParseLogLevel(c.Logging.Level); err != nil { + return err + } + + return nil +} + +// validateChunkingTiers enforces the unambiguous-tier invariants the +// SizeFor selection rule depends on: every tier has positive bounds, +// the ChunkSize floor matches Chunking.Size's 1 MiB minimum, and +// MinObjectSize values are strictly ascending. Unsorted input is +// rejected (rather than silently sorted) so operators see the typo +// in their YAML rather than diagnosing a surprising chunk-size +// selection in production. +func validateChunkingTiers(tiers []ChunkTier) error { + for i, t := range tiers { + if t.MinObjectSize <= 0 { + return fmt.Errorf("chunking.tiers[%d].min_object_size %d invalid; must be > 0", + i, t.MinObjectSize) + } + + if t.ChunkSize < 1024*1024 { + return fmt.Errorf("chunking.tiers[%d].chunk_size %d too small; minimum 1 MiB", + i, t.ChunkSize) + } + + if i > 0 && t.MinObjectSize <= tiers[i-1].MinObjectSize { + return fmt.Errorf( + "chunking.tiers must be strictly ascending by min_object_size; "+ + "tiers[%d].min_object_size=%d is not greater than tiers[%d].min_object_size=%d", + i, t.MinObjectSize, i-1, tiers[i-1].MinObjectSize) + } + } + + return nil +} + +// ParseLogLevel maps an orca log-level string to slog.Level. Returns +// an error for unknown values. Empty string is treated as the +// configured default ("info"). Used both by config.validate at YAML +// parse time and by the cmd/orca entrypoint to honour the +// ORCA_LOG_LEVEL environment override. +func ParseLogLevel(s string) (slog.Level, error) { + switch strings.ToLower(strings.TrimSpace(s)) { + case "", "info": + return slog.LevelInfo, nil + case "debug": + return slog.LevelDebug, nil + case "warn", "warning": + return slog.LevelWarn, nil + case "error": + return slog.LevelError, nil + default: + return 0, fmt.Errorf("logging.level %q invalid; expected one of debug, info, warn, error", s) + } +} + +// TargetPerReplica returns the per-replica origin concurrency cap +// derived from origin.target_global divided by cluster.target_replicas. +// This bounds the number of concurrent in-flight origin requests this +// replica will issue. +func (c *Config) TargetPerReplica() int { + if c.Cluster.TargetReplicas <= 0 { + return c.Origin.TargetGlobal + } + + return c.Origin.TargetGlobal / c.Cluster.TargetReplicas +} diff --git a/internal/orca/config/config_test.go b/internal/orca/config/config_test.go new file mode 100644 index 00000000..2d64a097 --- /dev/null +++ b/internal/orca/config/config_test.go @@ -0,0 +1,635 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package config + +import ( + "log/slog" + "os" + "path/filepath" + "strings" + "testing" + "time" +) + +// TestApplyDefaults_EnvFallback verifies that applyDefaults populates +// credential / pod-identity fields from environment variables when +// the YAML omits them. This is the path used in production where the +// Kubernetes Secret is mounted via envFrom and the ConfigMap holds +// only the non-secret config. +// +// Each subtest sets one env var and checks that: +// - env-set, yaml-empty -> field populated from env. +// - env-unset, yaml-set -> field keeps yaml value. +// - env-set, yaml-set -> field keeps yaml value (yaml wins). +// - env-unset, yaml-empty -> field stays empty. +func TestApplyDefaults_EnvFallback(t *testing.T) { + tests := []struct { + envVar string + setVal func(c *Config, v string) + getVal func(c *Config) string + }{ + { + envVar: "POD_IP", + setVal: func(c *Config, v string) { c.Cluster.SelfPodIP = v }, + getVal: func(c *Config) string { return c.Cluster.SelfPodIP }, + }, + { + envVar: "ORCA_AZUREBLOB_ACCOUNT_KEY", + setVal: func(c *Config, v string) { c.Origin.Azureblob.AccountKey = v }, + getVal: func(c *Config) string { return c.Origin.Azureblob.AccountKey }, + }, + { + envVar: "ORCA_AWSS3_ACCESS_KEY", + setVal: func(c *Config, v string) { c.Origin.AWSS3.AccessKey = v }, + getVal: func(c *Config) string { return c.Origin.AWSS3.AccessKey }, + }, + { + envVar: "ORCA_AWSS3_SECRET_KEY", + setVal: func(c *Config, v string) { c.Origin.AWSS3.SecretKey = v }, + getVal: func(c *Config) string { return c.Origin.AWSS3.SecretKey }, + }, + { + envVar: "ORCA_CACHESTORE_S3_ACCESS_KEY", + setVal: func(c *Config, v string) { c.Cachestore.S3.AccessKey = v }, + getVal: func(c *Config) string { return c.Cachestore.S3.AccessKey }, + }, + { + envVar: "ORCA_CACHESTORE_S3_SECRET_KEY", + setVal: func(c *Config, v string) { c.Cachestore.S3.SecretKey = v }, + getVal: func(c *Config) string { return c.Cachestore.S3.SecretKey }, + }, + } + + for _, tt := range tests { + t.Run(tt.envVar, func(t *testing.T) { + t.Run("env_set/yaml_empty", func(t *testing.T) { + t.Setenv(tt.envVar, "from-env") + + c := &Config{} + c.applyDefaults() + + if got := tt.getVal(c); got != "from-env" { + t.Errorf("got %q want %q", got, "from-env") + } + }) + + t.Run("env_unset/yaml_set", func(t *testing.T) { + _ = os.Unsetenv(tt.envVar) //nolint:errcheck // best-effort + + c := &Config{} + tt.setVal(c, "from-yaml") + c.applyDefaults() + + if got := tt.getVal(c); got != "from-yaml" { + t.Errorf("got %q want %q", got, "from-yaml") + } + }) + + t.Run("env_set/yaml_set_yaml_wins", func(t *testing.T) { + t.Setenv(tt.envVar, "from-env") + + c := &Config{} + tt.setVal(c, "from-yaml") + c.applyDefaults() + + if got := tt.getVal(c); got != "from-yaml" { + t.Errorf("got %q want %q (yaml should win)", got, "from-yaml") + } + }) + + t.Run("env_unset/yaml_empty", func(t *testing.T) { + _ = os.Unsetenv(tt.envVar) //nolint:errcheck // best-effort + + c := &Config{} + c.applyDefaults() + + if got := tt.getVal(c); got != "" { + t.Errorf("got %q want empty", got) + } + }) + }) + } +} + +// TestApplyDefaults_FieldDefaults verifies that the hard-coded +// fallback values fire for every field whose zero value is replaced. +func TestApplyDefaults_FieldDefaults(t *testing.T) { + t.Parallel() + + c := &Config{} + c.applyDefaults() + + checks := []struct { + name string + got any + want any + }{ + {"server.listen", c.Server.Listen, "0.0.0.0:8443"}, + {"server.ops_listen", c.Server.OpsListen, "0.0.0.0:8442"}, + {"origin.driver", c.Origin.Driver, "azureblob"}, + {"origin.target_global", c.Origin.TargetGlobal, 192}, + {"origin.queue_timeout", c.Origin.QueueTimeout, 5 * time.Second}, + {"origin.retry.attempts", c.Origin.Retry.Attempts, 3}, + {"origin.retry.backoff_initial", c.Origin.Retry.BackoffInitial, 100 * time.Millisecond}, + {"origin.retry.backoff_max", c.Origin.Retry.BackoffMax, 2 * time.Second}, + {"origin.retry.max_total_duration", c.Origin.Retry.MaxTotalDuration, 5 * time.Second}, + {"cachestore.driver", c.Cachestore.Driver, "s3"}, + {"cachestore.s3.region", c.Cachestore.S3.Region, "us-east-1"}, + {"cluster.membership_refresh", c.Cluster.MembershipRefresh, 5 * time.Second}, + {"cluster.internal_listen", c.Cluster.InternalListen, "0.0.0.0:8444"}, + {"cluster.target_replicas", c.Cluster.TargetReplicas, 3}, + {"cluster.internal_tls.server_name", c.Cluster.InternalTLS.ServerName, "orca..svc"}, + {"chunk_catalog.max_entries", c.ChunkCatalog.MaxEntries, 100_000}, + {"metadata.ttl", c.Metadata.TTL, 5 * time.Minute}, + {"metadata.negative_ttl", c.Metadata.NegativeTTL, 60 * time.Second}, + {"metadata.max_entries", c.Metadata.MaxEntries, 10_000}, + {"chunking.size", c.Chunking.Size, int64(8 * 1024 * 1024)}, + {"origin.awss3.region", c.Origin.AWSS3.Region, "us-east-1"}, + {"logging.level", c.Logging.Level, "info"}, + } + + for _, ch := range checks { + if ch.got != ch.want { + t.Errorf("%s: got %v want %v", ch.name, ch.got, ch.want) + } + } + + // Tiers default to the documented 2-entry ladder. Compared + // separately since slice equality cannot use the table. + wantTiers := []ChunkTier{ + {MinObjectSize: 1024 * 1024 * 1024, ChunkSize: 64 * 1024 * 1024}, + {MinObjectSize: 10 * 1024 * 1024 * 1024, ChunkSize: 128 * 1024 * 1024}, + } + if len(c.Chunking.Tiers) != len(wantTiers) { + t.Errorf("chunking.tiers length=%d want %d", len(c.Chunking.Tiers), len(wantTiers)) + } else { + for i := range wantTiers { + if c.Chunking.Tiers[i] != wantTiers[i] { + t.Errorf("chunking.tiers[%d]=%+v want %+v", + i, c.Chunking.Tiers[i], wantTiers[i]) + } + } + } + // Readahead defaults to a non-nil pointer to 8. + if c.Chunking.Readahead == nil { + t.Errorf("chunking.readahead is nil; expected default pointer") + } else if *c.Chunking.Readahead != 8 { + t.Errorf("chunking.readahead=%d want 8", *c.Chunking.Readahead) + } +} + +// TestApplyDefaults_PreservesExplicitValues verifies that explicit +// non-zero values are not overwritten by applyDefaults. +func TestApplyDefaults_PreservesExplicitValues(t *testing.T) { + t.Parallel() + + c := &Config{ + Server: Server{Listen: "1.2.3.4:9000"}, + Origin: Origin{ + Driver: "awss3", + TargetGlobal: 64, + }, + Cachestore: Cachestore{S3: CachestoreS3{Region: "eu-west-1"}}, + Cluster: Cluster{TargetReplicas: 7, MembershipRefresh: 10 * time.Second}, + ChunkCatalog: ChunkCatalog{MaxEntries: 50}, + Metadata: Metadata{TTL: time.Hour, MaxEntries: 99}, + Chunking: Chunking{Size: 16 << 20}, + } + + c.applyDefaults() + + if c.Server.Listen != "1.2.3.4:9000" { + t.Errorf("Server.Listen overwritten: %q", c.Server.Listen) + } + + if c.Origin.Driver != "awss3" { + t.Errorf("Origin.Driver overwritten: %q", c.Origin.Driver) + } + + if c.Origin.TargetGlobal != 64 { + t.Errorf("Origin.TargetGlobal overwritten: %d", c.Origin.TargetGlobal) + } + + if c.Cachestore.S3.Region != "eu-west-1" { + t.Errorf("Cachestore.S3.Region overwritten: %q", c.Cachestore.S3.Region) + } + + if c.Cluster.TargetReplicas != 7 { + t.Errorf("Cluster.TargetReplicas overwritten: %d", c.Cluster.TargetReplicas) + } + + if c.Cluster.MembershipRefresh != 10*time.Second { + t.Errorf("Cluster.MembershipRefresh overwritten: %v", c.Cluster.MembershipRefresh) + } + + if c.ChunkCatalog.MaxEntries != 50 { + t.Errorf("ChunkCatalog.MaxEntries overwritten: %d", c.ChunkCatalog.MaxEntries) + } + + if c.Metadata.TTL != time.Hour { + t.Errorf("Metadata.TTL overwritten: %v", c.Metadata.TTL) + } + + if c.Chunking.Size != 16<<20 { + t.Errorf("Chunking.Size overwritten: %d", c.Chunking.Size) + } +} + +// TestLoad_Validate covers the validate() error paths. +func TestLoad_Validate(t *testing.T) { + // No t.Parallel: subtests use t.Setenv to neutralize POD_IP. + tests := []struct { + name string + yaml string + wantErr string + wantOK bool + }{ + { + name: "valid awss3 config", + yaml: validAwss3YAML, + wantOK: true, + }, + { + name: "missing origin.id", + yaml: strings.ReplaceAll(validAwss3YAML, "id: test-origin", "id: \"\""), + wantErr: "origin.id is required", + }, + { + name: "unsupported driver", + yaml: strings.ReplaceAll(validAwss3YAML, "driver: awss3", "driver: ftp"), + wantErr: "origin.driver", + }, + { + name: "missing awss3 bucket", + yaml: strings.ReplaceAll(validAwss3YAML, "bucket: orca-origin", "bucket: \"\""), + wantErr: "origin.awss3.bucket is required", + }, + { + name: "missing cachestore endpoint", + yaml: strings.ReplaceAll(validAwss3YAML, "endpoint: http://localstack:4566", "endpoint: \"\""), + wantErr: "cachestore.s3.endpoint is required", + }, + { + name: "missing cluster service", + yaml: strings.ReplaceAll(validAwss3YAML, "service: orca-peers.svc", "service: \"\""), + wantErr: "cluster.service is required", + }, + { + name: "missing self_pod_ip when POD_IP unset", + yaml: strings.ReplaceAll(validAwss3YAML, "self_pod_ip: 10.0.0.1", "self_pod_ip: \"\""), + wantErr: "self_pod_ip is required", + }, + { + name: "target_replicas negative", + yaml: strings.ReplaceAll(validAwss3YAML, "target_replicas: 3", "target_replicas: -1"), + wantErr: "target_replicas", + }, + { + name: "chunking size below minimum", + yaml: strings.ReplaceAll(validAwss3YAML, "size: 8388608", "size: 4096"), + wantErr: "chunking.size", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Ensure no leakage of POD_IP from the test process env. + t.Setenv("POD_IP", "") + + path := writeTempYAML(t, tt.yaml) + + _, err := Load(path) + if tt.wantOK { + if err != nil { + t.Fatalf("expected nil error, got %v", err) + } + + return + } + + if err == nil { + t.Fatalf("expected error containing %q, got nil", tt.wantErr) + } + + if !strings.Contains(err.Error(), tt.wantErr) { + t.Errorf("error %q does not contain %q", err.Error(), tt.wantErr) + } + }) + } +} + +// TestValidateChunkingTiers_OK covers tier ladders that should pass +// validation: empty (feature off), single tier, multi-tier strictly +// ascending. +func TestValidateChunkingTiers_OK(t *testing.T) { + t.Parallel() + + cases := [][]ChunkTier{ + nil, + {}, + {{MinObjectSize: 1 << 30, ChunkSize: 64 << 20}}, + { + {MinObjectSize: 1 << 30, ChunkSize: 64 << 20}, + {MinObjectSize: 10 << 30, ChunkSize: 128 << 20}, + }, + } + + for i, tiers := range cases { + if err := validateChunkingTiers(tiers); err != nil { + t.Errorf("case[%d] unexpected error: %v", i, err) + } + } +} + +// TestValidateChunkingTiers_Errors covers the rejection paths: tiny +// chunk size, zero / negative min object size, unsorted thresholds, +// and duplicate thresholds (caught by the strict-ascending rule). +func TestValidateChunkingTiers_Errors(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + tiers []ChunkTier + wantErr string + }{ + { + name: "chunk size below 1 MiB", + tiers: []ChunkTier{ + {MinObjectSize: 1 << 30, ChunkSize: 1024}, + }, + wantErr: "chunk_size", + }, + { + name: "zero min object size", + tiers: []ChunkTier{ + {MinObjectSize: 0, ChunkSize: 64 << 20}, + }, + wantErr: "min_object_size", + }, + { + name: "negative min object size", + tiers: []ChunkTier{ + {MinObjectSize: -1, ChunkSize: 64 << 20}, + }, + wantErr: "min_object_size", + }, + { + name: "unsorted ascending rejected", + tiers: []ChunkTier{ + {MinObjectSize: 10 << 30, ChunkSize: 64 << 20}, + {MinObjectSize: 1 << 30, ChunkSize: 128 << 20}, + }, + wantErr: "strictly ascending", + }, + { + name: "duplicate min object size rejected", + tiers: []ChunkTier{ + {MinObjectSize: 1 << 30, ChunkSize: 64 << 20}, + {MinObjectSize: 1 << 30, ChunkSize: 128 << 20}, + }, + wantErr: "strictly ascending", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := validateChunkingTiers(tt.tiers) + if err == nil { + t.Fatalf("expected error containing %q, got nil", tt.wantErr) + } + + if !strings.Contains(err.Error(), tt.wantErr) { + t.Errorf("error %q does not contain %q", err.Error(), tt.wantErr) + } + }) + } +} + +// TestLoad_TiersAndReadahead drives validation through Load (full +// YAML path) to ensure the tier rejection surfaces with the rich +// error message and that an explicit readahead: 0 disables prefetch +// (i.e. survives applyDefaults and is not bumped back to 8). +func TestLoad_TiersAndReadahead(t *testing.T) { + t.Parallel() + + t.Run("explicit_readahead_zero_preserved", func(t *testing.T) { + yaml := validAwss3YAML + " readahead: 0\n" + path := writeTempYAML(t, yaml) + + cfg, err := Load(path) + if err != nil { + t.Fatalf("Load: %v", err) + } + + if cfg.Chunking.Readahead == nil { + t.Fatalf("Readahead should be non-nil after applyDefaults") + } + + if *cfg.Chunking.Readahead != 0 { + t.Errorf("Readahead=%d want 0 (explicit disable preserved)", *cfg.Chunking.Readahead) + } + + if d := cfg.Chunking.ReadaheadDepth(); d != 0 { + t.Errorf("ReadaheadDepth()=%d want 0", d) + } + }) + + t.Run("explicit_empty_tiers_preserved", func(t *testing.T) { + yaml := validAwss3YAML + " tiers: []\n" + path := writeTempYAML(t, yaml) + + cfg, err := Load(path) + if err != nil { + t.Fatalf("Load: %v", err) + } + // Tiers explicitly set to [] should survive applyDefaults + // (the default ladder must not overwrite operator intent). + if len(cfg.Chunking.Tiers) != 0 { + t.Errorf("Tiers=%v want []; applyDefaults overwrote explicit empty", + cfg.Chunking.Tiers) + } + + if cfg.Chunking.AsChunkTiers() != nil { + t.Errorf("AsChunkTiers() returned non-nil for empty tiers") + } + }) + + t.Run("unsorted_tiers_rejected", func(t *testing.T) { + yaml := validAwss3YAML + ` tiers: + - min_object_size: 10737418240 + chunk_size: 67108864 + - min_object_size: 1073741824 + chunk_size: 134217728 +` + path := writeTempYAML(t, yaml) + + _, err := Load(path) + if err == nil { + t.Fatalf("Load accepted unsorted tiers") + } + + if !strings.Contains(err.Error(), "strictly ascending") { + t.Errorf("error %q does not mention strict ascending order", err.Error()) + } + }) + + t.Run("negative_readahead_rejected", func(t *testing.T) { + yaml := validAwss3YAML + " readahead: -1\n" + path := writeTempYAML(t, yaml) + + _, err := Load(path) + if err == nil { + t.Fatalf("Load accepted negative readahead") + } + + if !strings.Contains(err.Error(), "chunking.readahead") { + t.Errorf("error %q does not mention chunking.readahead", err.Error()) + } + }) +} + +// TestChunking_AsChunkTiers covers the config -> chunk.Tier mapping +// preserves order and field values, and returns nil for empty. +func TestChunking_AsChunkTiers(t *testing.T) { + t.Parallel() + + c := Chunking{ + Size: 8 << 20, + Tiers: []ChunkTier{ + {MinObjectSize: 1 << 30, ChunkSize: 64 << 20}, + {MinObjectSize: 10 << 30, ChunkSize: 128 << 20}, + }, + } + + got := c.AsChunkTiers() + if len(got) != 2 { + t.Fatalf("len=%d want 2", len(got)) + } + + if got[0].MinObjectSize != 1<<30 || got[0].ChunkSize != 64<<20 { + t.Errorf("got[0]=%+v", got[0]) + } + + if got[1].MinObjectSize != 10<<30 || got[1].ChunkSize != 128<<20 { + t.Errorf("got[1]=%+v", got[1]) + } + + if (Chunking{}).AsChunkTiers() != nil { + t.Errorf("empty Chunking.AsChunkTiers() should be nil") + } +} + +// TestParseLogLevel covers the orca log-level string -> slog.Level +// mapping. Both empty and "info" map to LevelInfo so the YAML default +// path matches the explicit-info path; "warn" and "warning" are +// accepted equivalently. Unknown values return a descriptive error +// so misconfiguration is surfaced rather than silently downgrading. +func TestParseLogLevel(t *testing.T) { + t.Parallel() + + tests := []struct { + in string + want slog.Level + wantErr bool + }{ + {"", slog.LevelInfo, false}, + {"info", slog.LevelInfo, false}, + {"INFO", slog.LevelInfo, false}, + {"debug", slog.LevelDebug, false}, + {" Debug ", slog.LevelDebug, false}, + {"warn", slog.LevelWarn, false}, + {"warning", slog.LevelWarn, false}, + {"error", slog.LevelError, false}, + {"trace", 0, true}, + {"verbose", 0, true}, + {"5", 0, true}, + } + + for _, tt := range tests { + t.Run(tt.in, func(t *testing.T) { + got, err := ParseLogLevel(tt.in) + if tt.wantErr { + if err == nil { + t.Errorf("ParseLogLevel(%q) = %v, want error", tt.in, got) + } + + return + } + + if err != nil { + t.Errorf("ParseLogLevel(%q) unexpected err: %v", tt.in, err) + return + } + + if got != tt.want { + t.Errorf("ParseLogLevel(%q) = %v, want %v", tt.in, got, tt.want) + } + }) + } +} + +// TestValidate_RejectsInvalidLogLevel verifies that an unrecognised +// logging.level value is caught at config.Load time rather than at +// process startup. +func TestValidate_RejectsInvalidLogLevel(t *testing.T) { + t.Parallel() + + yaml := validAwss3YAML + ` +logging: + level: trace +` + path := writeTempYAML(t, yaml) + + _, err := Load(path) + if err == nil { + t.Fatalf("Load accepted invalid logging.level: trace") + } + + if !strings.Contains(err.Error(), "logging.level") { + t.Errorf("error does not mention logging.level: %v", err) + } +} + +func writeTempYAML(t *testing.T, content string) string { + t.Helper() + + dir := t.TempDir() + path := filepath.Join(dir, "config.yaml") + + if err := os.WriteFile(path, []byte(content), 0o600); err != nil { + t.Fatalf("write temp yaml: %v", err) + } + + return path +} + +const validAwss3YAML = ` +server: + listen: 0.0.0.0:8443 +origin: + id: test-origin + driver: awss3 + awss3: + endpoint: http://localstack:4566 + region: us-east-1 + bucket: orca-origin + access_key: test + secret_key: test + use_path_style: true +cachestore: + driver: s3 + s3: + endpoint: http://localstack:4566 + bucket: orca-cache + region: us-east-1 + access_key: test + secret_key: test + use_path_style: true +cluster: + service: orca-peers.svc + self_pod_ip: 10.0.0.1 + target_replicas: 3 +chunking: + size: 8388608 +` diff --git a/internal/orca/fetch/fetch.go b/internal/orca/fetch/fetch.go new file mode 100644 index 00000000..5b1865cc --- /dev/null +++ b/internal/orca/fetch/fetch.go @@ -0,0 +1,563 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package fetch is the per-replica fill orchestrator: per-ChunkKey +// singleflight, pre-header origin retry, per-replica origin +// concurrency cap, and cross-replica fill via the cluster's internal +// RPC. +// +// The dedup model is per-replica singleflight + cluster-wide dedup +// via a rendezvous-hashed coordinator. No disk spool; joiners stream +// from the leader's in-memory ring buffer. +// +// Pre-header retry: the coordinator may retry origin GETs up to the +// budget in cfg.Origin.Retry until the first byte is committed to +// the client response. Once headers are sent retries are not safe and +// failures become mid-stream aborts. +package fetch + +import ( + "bytes" + "context" + "errors" + "fmt" + "io" + "log/slog" + "sync" + "time" + + "github.com/Azure/unbounded/internal/orca/cachestore" + "github.com/Azure/unbounded/internal/orca/chunk" + "github.com/Azure/unbounded/internal/orca/chunkcatalog" + "github.com/Azure/unbounded/internal/orca/cluster" + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/metadata" + "github.com/Azure/unbounded/internal/orca/origin" +) + +// Coordinator orchestrates per-replica chunk fills. +type Coordinator struct { + or origin.Origin + cs cachestore.CacheStore + cl *cluster.Cluster + cat *chunkcatalog.Catalog + mc *metadata.Cache + cfg *config.Config + log *slog.Logger + + // Per-replica origin concurrency cap. Bounds in-flight + // Origin.GetRange calls to floor(target_global / target_replicas). + originSem chan struct{} + + // Per-ChunkKey singleflight. Concurrent local fills for the same + // chunk collapse to one origin GetRange. + mu sync.Mutex + inflight map[string]*fill +} + +type fill struct { + done chan struct{} + bodyBuf *bytes.Buffer // buffered chunk after fetch (in-memory, bounded by chunk size) + err error +} + +// NewCoordinator wires up the fetch coordinator. The log is used for +// peer-fallback warnings and commit-after-serve failure traces, plus +// debug-level tracing through every chunk-resolution decision point +// when the operator enables logging.level: debug. The caller (usually +// app.Start) injects the app-wide slog.Logger so fetch-path logs are +// unified with the rest of the runtime's output. Passing nil falls +// back to slog.Default(). +func NewCoordinator( + or origin.Origin, + cs cachestore.CacheStore, + cl *cluster.Cluster, + cat *chunkcatalog.Catalog, + mc *metadata.Cache, + cfg *config.Config, + log *slog.Logger, +) *Coordinator { + tpr := cfg.TargetPerReplica() + if tpr < 1 { + tpr = 1 + } + + if log == nil { + log = slog.Default() + } + + return &Coordinator{ + or: or, + cs: cs, + cl: cl, + cat: cat, + mc: mc, + cfg: cfg, + log: log, + originSem: make(chan struct{}, tpr), + inflight: make(map[string]*fill), + } +} + +// Origin returns the underlying origin (used by the LIST passthrough). +func (c *Coordinator) Origin() origin.Origin { return c.or } + +// HeadObject returns object metadata, satisfying client HEAD requests. +// +// Rejects responses with an empty ETag via origin.MissingETagError. +// chunk.Path encodes the ETag in its hash input; a stable cache key +// requires the origin to supply one. Without an ETag, two different +// versions of the same (bucket, key) would alias to the same +// chunk.Path and serve stale bytes silently. The negative result is +// cached at NegativeTTL so we do not re-Head a misconfigured origin +// on every request. +func (c *Coordinator) HeadObject(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) { + c.log.LogAttrs(ctx, slog.LevelDebug, "head_object", + slog.String("origin_id", c.cfg.Origin.ID), + slog.String("bucket", bucket), + slog.String("key", key), + ) + + return c.mc.LookupOrFetch(ctx, c.cfg.Origin.ID, bucket, key, + func(ctx context.Context) (origin.ObjectInfo, error) { + info, err := c.or.Head(ctx, bucket, key) + if err != nil { + return info, err + } + + if info.ETag == "" { + return info, &origin.MissingETagError{Bucket: bucket, Key: key} + } + + return info, nil + }) +} + +// GetChunk returns a reader over the chunk's bytes, fulfilling either +// from CacheStore (hit) or by orchestrating a cluster-wide +// dedup'd fill (miss). +// +// objectSize is the authoritative size of the object the chunk +// belongs to (from origin Head). It is used to clamp the cachestore +// read length and to size the tail chunk correctly on a miss. +// +// On miss: +// - If self is the coordinator: run local fill (origin GET via retry, +// atomic commit to CacheStore, populate buffer for joiners). +// - If a peer is the coordinator: send /internal/fill to that peer; +// stream from peer's response. On 409 Conflict, fall back to local +// fill. +func (c *Coordinator) GetChunk(ctx context.Context, k chunk.Key, objectSize int64) (io.ReadCloser, error) { + c.log.LogAttrs(ctx, slog.LevelDebug, "get_chunk", + chunkAttrs(k), + slog.Int64("object_size", objectSize), + slog.Int64("expected_len", k.ExpectedLen(objectSize)), + ) + + if rc, hit, err := c.lookupOrStat(ctx, k, objectSize); err != nil { + return nil, err + } else if hit { + return rc, nil + } + + // Cluster-wide dedup: route to coordinator. + coord := c.cl.Coordinator(k) + + c.log.LogAttrs(ctx, slog.LevelDebug, "coordinator_selected", + chunkAttrs(k), + slog.String("coord_ip", coord.IP), + slog.Bool("is_self", coord.Self), + ) + + if !coord.Self { + c.log.LogAttrs(ctx, slog.LevelDebug, "peer_fill_attempt", + chunkAttrs(k), + slog.String("peer_ip", coord.IP), + ) + + rc, err := c.cl.FillFromPeer(ctx, coord, k, objectSize) + if err == nil { + c.log.LogAttrs(ctx, slog.LevelDebug, "peer_fill_success", + chunkAttrs(k), + slog.String("peer_ip", coord.IP), + ) + + return rc, nil + } + + if errors.Is(err, cluster.ErrPeerNotCoordinator) { + c.log.LogAttrs(ctx, slog.LevelWarn, "peer reported not-coordinator; falling back to local fill", + chunkAttrs(k), + slog.String("peer_ip", coord.IP), + ) + // fall through to local fill + } else { + c.log.LogAttrs(ctx, slog.LevelWarn, "internal-fill RPC failed; falling back to local fill", + chunkAttrs(k), + slog.String("peer_ip", coord.IP), + slog.Any("err", err), + ) + } + } + + return c.fillLocal(ctx, k, objectSize) +} + +// FillForPeer is the path taken by the /internal/fill handler. +// +// The receiver becomes the leader for this fill (or joins an in-flight +// fill for the same key). Returns a streaming body of the entire chunk. +func (c *Coordinator) FillForPeer(ctx context.Context, k chunk.Key, objectSize int64) (io.ReadCloser, error) { + c.log.LogAttrs(ctx, slog.LevelDebug, "fill_for_peer", + chunkAttrs(k), + slog.Int64("object_size", objectSize), + ) + + if rc, hit, err := c.lookupOrStat(ctx, k, objectSize); err != nil { + return nil, err + } else if hit { + return rc, nil + } + + return c.fillLocal(ctx, k, objectSize) +} + +// lookupOrStat is the shared catalog-hit / cachestore-stat probe used +// by both GetChunk and FillForPeer. Returns (body, true, nil) when a +// pre-existing chunk is found, (nil, false, nil) on a clean miss +// (caller should run the appropriate fill path), or (nil, false, err) +// for non-recoverable cachestore errors. +// +// On a catalog hit that turns out to be stale (cachestore returns +// ErrNotFound), the catalog entry is forgotten so the next call +// re-stats fresh. +func (c *Coordinator) lookupOrStat(ctx context.Context, k chunk.Key, objectSize int64) (io.ReadCloser, bool, error) { + expected := k.ExpectedLen(objectSize) + + if c.cat.Lookup(k) { + c.log.LogAttrs(ctx, slog.LevelDebug, "catalog_hit", + chunkAttrs(k), + ) + + rc, err := c.cs.GetChunk(ctx, k, 0, expected) + if err == nil { + return rc, true, nil + } + + if errors.Is(err, cachestore.ErrNotFound) { + c.log.LogAttrs(ctx, slog.LevelDebug, "catalog_stale_forgotten", + chunkAttrs(k), + ) + c.cat.Forget(k) + // fall through to stat + } else { + return nil, false, err + } + } + + info, err := c.cs.Stat(ctx, k) + if err != nil { + if errors.Is(err, cachestore.ErrNotFound) { + c.log.LogAttrs(ctx, slog.LevelDebug, "cachestore_stat_miss", + chunkAttrs(k), + ) + + return nil, false, nil + } + + return nil, false, err + } + + c.log.LogAttrs(ctx, slog.LevelDebug, "cachestore_stat_hit", + chunkAttrs(k), + slog.Int64("size", info.Size), + ) + + c.cat.Record(k) + + // Trust the stat's reported size if it disagrees with our + // expectation (e.g. older committed entry from before a chunk + // size change), but clamp to the expected length so a corrupt + // larger stat does not leak bytes past the object end. + readLen := info.Size + if expected > 0 && readLen > expected { + readLen = expected + } + + rc, err := c.cs.GetChunk(ctx, k, 0, readLen) + if err != nil { + return nil, false, err + } + + return rc, true, nil +} + +// fillLocal runs (or joins) the singleflight for k on this replica. +func (c *Coordinator) fillLocal(ctx context.Context, k chunk.Key, objectSize int64) (io.ReadCloser, error) { + path := k.Path() + + c.mu.Lock() + + f, ok := c.inflight[path] + if !ok { + f = &fill{done: make(chan struct{})} + c.inflight[path] = f + c.mu.Unlock() + + c.log.LogAttrs(ctx, slog.LevelDebug, "fill_local_lead", + chunkAttrs(k), + ) + + go c.runFill(k, objectSize, f) + } else { + c.mu.Unlock() + c.log.LogAttrs(ctx, slog.LevelDebug, "fill_local_join", + chunkAttrs(k), + ) + } + + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-f.done: + } + + if f.err != nil { + return nil, f.err + } + + return io.NopCloser(bytes.NewReader(f.bodyBuf.Bytes())), nil +} + +func (c *Coordinator) runFill(k chunk.Key, objectSize int64, f *fill) { + // runFill runs on a fill-scoped detached context (not the + // caller's) so it can complete the cachestore commit step even + // if the originating client disconnects mid-stream. The 5-minute + // ceiling bounds the cost: a fill no joiner ever reads still + // releases its origin-semaphore slot and clears its inflight + // entry within the budget. Peak per-fill heap is one ChunkSize + // bytes.Buffer (8 MiB default). + // + // Commit-after-serve ordering: once the origin body is fully + // fetched and validated, joiners are released (close(f.done)) + // BEFORE the PutChunk RPC begins. This shaves joiner latency by + // the cachestore commit time on the cold-fill path: joiners get + // bytes as soon as origin delivered them, and the commit runs in + // parallel from the joiners' perspective. Correctness is + // preserved because the buffer is fully populated and + // length-validated before release; PutChunk reads buf.Bytes() + // concurrently with joiner reads, but bytes.Buffer is never + // mutated after the final io.Copy returns, so the underlying + // byte slice is effectively immutable and safe for concurrent + // reads. + // + // release() is sync.Once-wrapped so close(f.done) fires exactly + // once whether via the explicit success-path call or the deferred + // safety net (which catches panic paths). + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + + var releaseOnce sync.Once + + release := func() { + releaseOnce.Do(func() { close(f.done) }) + } + + defer func() { + release() + c.mu.Lock() + delete(c.inflight, k.Path()) + c.mu.Unlock() + }() + + // Acquire per-replica origin slot. + queueCtx, queueCancel := context.WithTimeout(ctx, c.cfg.Origin.QueueTimeout) + defer queueCancel() + + select { + case c.originSem <- struct{}{}: + case <-queueCtx.Done(): + f.err = fmt.Errorf("origin: queue timeout (cap=%d)", cap(c.originSem)) + return + } + + defer func() { <-c.originSem }() + + c.log.LogAttrs(ctx, slog.LevelDebug, "origin_slot_acquired", + chunkAttrs(k), + slog.Int("slot_cap", cap(c.originSem)), + ) + + // expectedLen is the authoritative number of bytes we should + // receive from origin: ChunkSize for non-tail chunks, the + // remainder for the tail. Production callers always supply a + // known objectSize, so expectedLen > 0; the wire format + // (DecodeChunkKey) and edge handler both reject the + // objectSize == 0 case at their boundaries, so the validation + // below is always exercised. + expectedLen := k.ExpectedLen(objectSize) + off := k.Index * k.ChunkSize + + body, err := c.fetchWithRetry(ctx, k, off, expectedLen) + if err != nil { + f.err = err + return + } + defer body.Close() //nolint:errcheck // origin body close best-effort + + buf := &bytes.Buffer{} + if _, err := io.Copy(buf, body); err != nil { + f.err = fmt.Errorf("fill copy: %w", err) + return + } + + c.log.LogAttrs(ctx, slog.LevelDebug, "origin_body_received", + chunkAttrs(k), + slog.Int("bytes", buf.Len()), + slog.Int64("expected_len", expectedLen), + ) + + if int64(buf.Len()) != expectedLen { + f.err = fmt.Errorf("origin returned %d bytes, expected %d (chunk=%s)", + buf.Len(), expectedLen, k.String()) + + return + } + + f.bodyBuf = buf + + // Release joiners BEFORE the PutChunk commit. Joiners' reads of + // f.bodyBuf.Bytes() are safe to overlap with the PutChunk RPC's + // read of the same slice: bytes.Buffer's internal slice is no + // longer mutated after io.Copy returned above. + release() + + // Atomic commit to CacheStore (asynchronous from joiners' + // perspective; they have their bytes already). + commitErr := c.cs.PutChunk(ctx, k, int64(buf.Len()), bytes.NewReader(buf.Bytes())) + + switch { + case commitErr == nil: + c.cat.Record(k) + c.log.LogAttrs(ctx, slog.LevelDebug, "commit_success", + chunkAttrs(k), + slog.Int("bytes", buf.Len()), + ) + case errors.Is(commitErr, cachestore.ErrCommitLost): + // Another replica won; treat existing CacheStore entry as truth. + c.log.LogAttrs(ctx, slog.LevelDebug, "commit_lost", + chunkAttrs(k), + ) + + if _, err := c.cs.Stat(ctx, k); err == nil { + c.cat.Record(k) + } else { + // Stat failed after a lost commit: cachestore is likely + // unhealthy (transient or otherwise). Catalog stays + // unrecorded (next request refills), but log so operators + // can see cachestore flapping. + c.log.LogAttrs(ctx, slog.LevelDebug, "commit_lost_stat_failed", + chunkAttrs(k), + slog.Any("err", err), + ) + } + default: + c.log.LogAttrs(ctx, slog.LevelWarn, "commit-after-serve failed", + chunkAttrs(k), + slog.Any("err", commitErr), + ) + // Don't record in catalog; next request refills. + } +} + +func (c *Coordinator) fetchWithRetry(ctx context.Context, k chunk.Key, off, length int64) (io.ReadCloser, error) { + deadline := time.Now().Add(c.cfg.Origin.Retry.MaxTotalDuration) + backoff := c.cfg.Origin.Retry.BackoffInitial + + var lastErr error + + for attempt := 1; attempt <= c.cfg.Origin.Retry.Attempts; attempt++ { + if err := ctx.Err(); err != nil { + return nil, err + } + + if time.Now().After(deadline) { + return nil, fmt.Errorf("origin retry exhausted (duration); last err: %w", lastErr) + } + + c.log.LogAttrs(ctx, slog.LevelDebug, "origin_get_range_attempt", + chunkAttrs(k), + slog.Int("attempt", attempt), + slog.Int64("off", off), + slog.Int64("length", length), + ) + + body, err := c.or.GetRange(ctx, k.Bucket, k.ObjectKey, k.ETag, off, length) + if err == nil { + c.log.LogAttrs(ctx, slog.LevelDebug, "origin_get_range_ok", + chunkAttrs(k), + slog.Int("attempt", attempt), + ) + + return body, nil + } + + lastErr = err + // Non-retryable: ETag changed. + var etagChanged *origin.OriginETagChangedError + if errors.As(err, &etagChanged) { + c.log.LogAttrs(ctx, slog.LevelDebug, "origin_etag_changed", + chunkAttrs(k), + slog.Int("attempt", attempt), + ) + c.mc.Invalidate(c.cfg.Origin.ID, k.Bucket, k.ObjectKey) + + return nil, err + } + // Non-retryable: not found. + if errors.Is(err, origin.ErrNotFound) { + c.log.LogAttrs(ctx, slog.LevelDebug, "origin_not_found", + chunkAttrs(k), + slog.Int("attempt", attempt), + ) + + return nil, err + } + + c.log.LogAttrs(ctx, slog.LevelDebug, "origin_retryable_error", + chunkAttrs(k), + slog.Int("attempt", attempt), + slog.Any("err", err), + slog.Duration("next_backoff", backoff), + ) + // Backoff. + if attempt < c.cfg.Origin.Retry.Attempts { + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-time.After(backoff): + } + + backoff *= 2 + if backoff > c.cfg.Origin.Retry.BackoffMax { + backoff = c.cfg.Origin.Retry.BackoffMax + } + } + } + + return nil, fmt.Errorf("origin retry exhausted (attempts); last err: %w", lastErr) +} + +// chunkAttrs returns a slog.Attr group identifying the chunk by its +// (origin, bucket, key, index) tuple. Used at every fetch-path log +// callsite for consistent grep / filter syntax across emissions. +// ETag is intentionally not surfaced here - log it via slog.String +// where needed using the chunk.Key's truncated String() form. +func chunkAttrs(k chunk.Key) slog.Attr { + return slog.Group("chunk", + slog.String("origin_id", k.OriginID), + slog.String("bucket", k.Bucket), + slog.String("key", k.ObjectKey), + slog.Int64("index", k.Index), + ) +} diff --git a/internal/orca/fetch/fetch_test.go b/internal/orca/fetch/fetch_test.go new file mode 100644 index 00000000..617d5eab --- /dev/null +++ b/internal/orca/fetch/fetch_test.go @@ -0,0 +1,450 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package fetch + +import ( + "bytes" + "context" + "errors" + "io" + "log/slog" + "strings" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/Azure/unbounded/internal/orca/cachestore" + "github.com/Azure/unbounded/internal/orca/chunk" + "github.com/Azure/unbounded/internal/orca/chunkcatalog" + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/metadata" + "github.com/Azure/unbounded/internal/orca/origin" +) + +// TestNewCoordinator_UsesInjectedLogger verifies the constructor +// stores the provided slog.Logger on the Coordinator. The peer-RPC +// fallback warnings and commit-after-serve failure traces emitted +// from the fetch path must flow through this logger rather than +// slog.Default(), so operators can route fetch logs alongside the +// rest of the app's structured output. +func TestNewCoordinator_UsesInjectedLogger(t *testing.T) { + t.Parallel() + + injected := slog.New(slog.NewTextHandler(io.Discard, nil)) + c := NewCoordinator(nil, nil, nil, nil, nil, &config.Config{}, injected) + + if c.log != injected { + t.Errorf("Coordinator.log not the injected logger") + } +} + +// TestNewCoordinator_NilLoggerFallsBackToDefault locks the contract +// that a nil logger falls back to slog.Default() rather than panicking +// during peer fallback or commit-after-serve. +func TestNewCoordinator_NilLoggerFallsBackToDefault(t *testing.T) { + t.Parallel() + + c := NewCoordinator(nil, nil, nil, nil, nil, &config.Config{}, nil) + if c.log == nil { + t.Errorf("nil logger should have fallen back to slog.Default()") + } +} + +// TestChunkAttrs_GroupShape locks the slog attribute taxonomy used +// by every fetch-path emission. The 'chunk' group must contain the +// (origin_id, bucket, key, index) identifying tuple so operator +// queries can grep on a single, consistent attribute path. +func TestChunkAttrs_GroupShape(t *testing.T) { + t.Parallel() + + var buf bytes.Buffer + + log := slog.New(slog.NewTextHandler(&buf, + &slog.HandlerOptions{Level: slog.LevelDebug})) + + log.LogAttrs(context.Background(), slog.LevelDebug, "probe", chunkAttrs(chunk.Key{ + OriginID: "origin-x", + Bucket: "bkt", + ObjectKey: "obj", + ChunkSize: 1024, + Index: 7, + })) + + out := buf.String() + for _, want := range []string{ + "chunk.origin_id=origin-x", + "chunk.bucket=bkt", + "chunk.key=obj", + "chunk.index=7", + } { + if !strings.Contains(out, want) { + t.Errorf("chunkAttrs output missing %q; got %q", want, out) + } + } +} + +// TestCoordinator_DebugEmissionsAtDebugLevel exercises a sample of +// the fetch-path debug emissions and asserts they reach the +// handler. We cannot drive the full GetChunk path here without +// standing up the entire dependency graph, so we exercise the +// representative log statements directly. The contract under test +// is that the call sites use LogAttrs at Debug level (so zero-cost +// at Info+) and emit the standardized 'chunk' attribute group. +func TestCoordinator_DebugEmissionsAtDebugLevel(t *testing.T) { + t.Parallel() + + var buf bytes.Buffer + + log := slog.New(slog.NewTextHandler(&buf, + &slog.HandlerOptions{Level: slog.LevelDebug})) + c := &Coordinator{log: log} + + k := chunk.Key{ + OriginID: "ox", + Bucket: "bkt", + ObjectKey: "obj", + ChunkSize: 1024, + Index: 3, + } + // Sample emissions corresponding to lookupOrStat hits, + // peer-fill route selection, and commit success. + c.log.LogAttrs(context.Background(), slog.LevelDebug, "catalog_hit", chunkAttrs(k)) + c.log.LogAttrs(context.Background(), slog.LevelDebug, "peer_fill_attempt", + chunkAttrs(k), slog.String("peer_ip", "10.0.0.5")) + c.log.LogAttrs(context.Background(), slog.LevelDebug, "commit_success", + chunkAttrs(k), slog.Int("bytes", 1024)) + + out := buf.String() + for _, want := range []string{"catalog_hit", "peer_fill_attempt", "commit_success", "chunk.index=3"} { + if !strings.Contains(out, want) { + t.Errorf("expected %q in debug output; got %q", want, out) + } + } +} + +// TestCoordinator_DebugFilteredAtInfo verifies that the standard +// LogAttrs path emits nothing when the handler is configured above +// Debug. This is the operational expectation: enabling Info-level +// logging silences the per-chunk traces entirely so production +// throughput is not affected by log overhead. +func TestCoordinator_DebugFilteredAtInfo(t *testing.T) { + t.Parallel() + + var buf bytes.Buffer + + log := slog.New(slog.NewTextHandler(&buf, + &slog.HandlerOptions{Level: slog.LevelInfo})) + c := &Coordinator{log: log} + + k := chunk.Key{OriginID: "ox", Bucket: "b", ObjectKey: "o", ChunkSize: 1024, Index: 0} + c.log.LogAttrs(context.Background(), slog.LevelDebug, "catalog_hit", chunkAttrs(k)) + + if buf.Len() != 0 { + t.Errorf("debug emission leaked through Info-level handler: %q", buf.String()) + } +} + +// TestCoordinator_WarnRoutesThroughInjectedHandler verifies that the +// (migrated to LogAttrs) commit-after-serve warning still surfaces +// at Warn level on the injected logger. Regression test for the +// existing call site that pre-dates the debug emissions. +func TestCoordinator_WarnRoutesThroughInjectedHandler(t *testing.T) { + t.Parallel() + + var buf bytes.Buffer + + log := slog.New(slog.NewTextHandler(&buf, &slog.HandlerOptions{Level: slog.LevelWarn})) + c := &Coordinator{log: log} + + k := chunk.Key{OriginID: "ox", Bucket: "b", ObjectKey: "o", ChunkSize: 1024, Index: 0} + c.log.LogAttrs(context.Background(), slog.LevelWarn, "commit-after-serve failed", + chunkAttrs(k), + slog.String("err", "stub put failure"), + ) + + out := buf.String() + if !strings.Contains(out, "commit-after-serve failed") { + t.Errorf("warning not captured; got %q", out) + } + + if !strings.Contains(out, "chunk.key=o") { + t.Errorf("chunk attribute missing; got %q", out) + } +} + +// fakeOriginForFill returns a fixed body for any GetRange call. +type fakeOriginForFill struct { + body []byte +} + +func (f *fakeOriginForFill) Head(_ context.Context, _, _ string) (origin.ObjectInfo, error) { + return origin.ObjectInfo{Size: int64(len(f.body)), ETag: "e1"}, nil +} + +func (f *fakeOriginForFill) GetRange(_ context.Context, _, _, _ string, _, _ int64) (io.ReadCloser, error) { + return io.NopCloser(bytes.NewReader(f.body)), nil +} + +func (f *fakeOriginForFill) List(_ context.Context, _, _, _ string, _ int) (origin.ListResult, error) { + return origin.ListResult{}, nil +} + +// slowPutCacheStore implements cachestore.CacheStore. PutChunk +// blocks until putGate is closed; signals putStarted when entered +// and putReturned when leaving. Used by the commit-after-serve test +// to observe the relative ordering of joiner release vs PutChunk +// completion. +type slowPutCacheStore struct { + putGate chan struct{} + putStarted chan struct{} + putReturned chan struct{} + closeOnce sync.Once + putCallCount atomic.Int64 +} + +func newSlowPutCacheStore() *slowPutCacheStore { + return &slowPutCacheStore{ + putGate: make(chan struct{}), + putStarted: make(chan struct{}), + putReturned: make(chan struct{}), + } +} + +func (s *slowPutCacheStore) GetChunk(_ context.Context, _ chunk.Key, _, _ int64) (io.ReadCloser, error) { + return nil, cachestore.ErrNotFound +} + +func (s *slowPutCacheStore) PutChunk(_ context.Context, _ chunk.Key, _ int64, _ io.Reader) error { + s.putCallCount.Add(1) + s.closeOnce.Do(func() { close(s.putStarted) }) + <-s.putGate + close(s.putReturned) + + return nil +} + +func (s *slowPutCacheStore) Stat(_ context.Context, _ chunk.Key) (cachestore.Info, error) { + return cachestore.Info{}, cachestore.ErrNotFound +} + +func (s *slowPutCacheStore) Delete(_ context.Context, _ chunk.Key) error { return nil } +func (s *slowPutCacheStore) SelfTestAtomicCommit(_ context.Context) error { return nil } + +// TestRunFill_CommitAfterServe_JoinerSeesBytesBeforeCommit verifies +// that runFill releases joiners (close(f.done)) BEFORE the cachestore +// PutChunk completes. With the prior commit-before-serve ordering, +// joiners had to wait an extra commit-rtt; this test detects a +// regression by asserting the joiner returns while PutChunk is still +// blocked. +// +// Regression for H-1. +func TestRunFill_CommitAfterServe_JoinerSeesBytesBeforeCommit(t *testing.T) { + t.Parallel() + + payload := []byte("hello world commit-after-serve test payload!!") + chunkSize := int64(len(payload)) + + or := &fakeOriginForFill{body: payload} + cs := newSlowPutCacheStore() + cat := chunkcatalog.New(64, slog.New(slog.NewTextHandler(io.Discard, nil))) + mc := metadata.NewCache(config.Metadata{TTL: time.Minute, NegativeTTL: time.Minute, MaxEntries: 16}, nil) + + cfg := &config.Config{ + Origin: config.Origin{ + ID: "ox", + QueueTimeout: time.Second, + Retry: config.OriginRetry{ + Attempts: 1, + BackoffInitial: time.Millisecond, + BackoffMax: time.Millisecond, + MaxTotalDuration: time.Second, + }, + TargetGlobal: 4, + }, + Cluster: config.Cluster{TargetReplicas: 1}, + } + + co := NewCoordinator(or, cs, nil, cat, mc, cfg, slog.New(slog.NewTextHandler(io.Discard, nil))) + + k := chunk.Key{ + OriginID: "ox", + Bucket: "b", + ObjectKey: "o", + ETag: "e1", + ChunkSize: chunkSize, + Index: 0, + } + + rcCh := make(chan io.ReadCloser, 1) + errCh := make(chan error, 1) + + go func() { + rc, err := co.fillLocal(context.Background(), k, chunkSize) + if err != nil { + errCh <- err + return + } + + rcCh <- rc + }() + // Wait for PutChunk to have been entered, ensuring runFill is + // past the validate-and-release point. + select { + case <-cs.putStarted: + case <-time.After(2 * time.Second): + close(cs.putGate) + t.Fatalf("PutChunk never entered; runFill never reached commit") + } + + // fillLocal should return now (joiner released before PutChunk + // completes). With the old commit-before-serve ordering it would + // still be blocked. + select { + case rc := <-rcCh: + // Verify PutChunk hasn't completed. + select { + case <-cs.putReturned: + t.Errorf("PutChunk returned before fillLocal; commit-after-serve regressed") + default: + } + + got, err := io.ReadAll(rc) + if err != nil { + t.Errorf("read body: %v", err) + } + + if !bytes.Equal(got, payload) { + t.Errorf("body mismatch: got %d bytes want %d", len(got), len(payload)) + } + + _ = rc.Close() //nolint:errcheck // test cleanup + case err := <-errCh: + close(cs.putGate) + t.Fatalf("fillLocal err: %v", err) + case <-time.After(2 * time.Second): + close(cs.putGate) + t.Fatalf("fillLocal didn't return while PutChunk was blocked; commit-after-serve regressed") + } + + // Release PutChunk and let runFill finish. + close(cs.putGate) + <-cs.putReturned +} + +// TestRunFill_ReleaseIdempotent_PanicSafe verifies that close(f.done) +// fires exactly once whether via the explicit success-path call or +// the deferred safety net. A panic mid-fill must not corrupt the +// channel state by double-closing it. +// +// Regression for H-1's sync.Once safety property. +func TestRunFill_ReleaseIdempotent_PanicSafe(t *testing.T) { + t.Parallel() + + // Use the test pattern directly: a sync.Once-wrapped close, + // called from two paths. + done := make(chan struct{}) + + var once sync.Once + + release := func() { once.Do(func() { close(done) }) } + + release() // explicit path + release() // simulated "deferred safety net" path - must not panic + + select { + case <-done: + // Closed - good. + default: + t.Errorf("done channel not closed after release()") + } +} + +// stubOriginEmptyETag returns ObjectInfo with no ETag - simulating a +// misconfigured origin (e.g. some S3-compatible backend without +// versioning, or a custom origin not following the AWS/Azure +// contract). +type stubOriginEmptyETag struct{} + +func (stubOriginEmptyETag) Head(_ context.Context, _, _ string) (origin.ObjectInfo, error) { + return origin.ObjectInfo{Size: 1024, ETag: ""}, nil +} + +func (stubOriginEmptyETag) GetRange(_ context.Context, _, _, _ string, _, _ int64) (io.ReadCloser, error) { + return nil, nil +} + +func (stubOriginEmptyETag) List(_ context.Context, _, _, _ string, _ int) (origin.ListResult, error) { + return origin.ListResult{}, nil +} + +// TestHeadObject_RejectsEmptyETag verifies that the coordinator +// rejects an origin Head response with an empty ETag. chunk.Path +// encodes the ETag in its hash; without it, two different versions +// of the same (bucket, key) would alias and serve stale bytes +// silently. +// +// Regression for H-7. +func TestHeadObject_RejectsEmptyETag(t *testing.T) { + t.Parallel() + + mc := metadata.NewCache(config.Metadata{TTL: time.Minute, NegativeTTL: time.Minute, MaxEntries: 16}, nil) + co := NewCoordinator(stubOriginEmptyETag{}, nil, nil, nil, mc, + &config.Config{Origin: config.Origin{ID: "ox"}, Cluster: config.Cluster{TargetReplicas: 1}}, + slog.New(slog.NewTextHandler(io.Discard, nil))) + + _, err := co.HeadObject(context.Background(), "b", "o") + if err == nil { + t.Fatalf("HeadObject accepted empty ETag; want MissingETagError") + } + + var mte *origin.MissingETagError + if !errors.As(err, &mte) { + t.Errorf("err type = %T (want *origin.MissingETagError): %v", err, err) + } +} + +// TestHeadObject_EmptyETag_CachedNegatively verifies that a second +// HeadObject call after a MissingETagError result does NOT re-hit +// the origin: the negative result must be cached so we do not +// hammer a misconfigured origin on every request. +func TestHeadObject_EmptyETag_CachedNegatively(t *testing.T) { + t.Parallel() + + or := &countingOrigin{inner: stubOriginEmptyETag{}} + mc := metadata.NewCache(config.Metadata{TTL: time.Minute, NegativeTTL: time.Minute, MaxEntries: 16}, nil) + co := NewCoordinator(or, nil, nil, nil, mc, + &config.Config{Origin: config.Origin{ID: "ox"}, Cluster: config.Cluster{TargetReplicas: 1}}, + slog.New(slog.NewTextHandler(io.Discard, nil))) + + for i := 0; i < 3; i++ { + _, err := co.HeadObject(context.Background(), "b", "o") + if err == nil { + t.Errorf("call %d: HeadObject accepted empty ETag", i) + } + } + + if got := or.headCalls.Load(); got != 1 { + t.Errorf("origin.Head invoked %d times; want 1 (negative cached)", got) + } +} + +// countingOrigin wraps an origin.Origin and counts Head invocations. +type countingOrigin struct { + inner origin.Origin + headCalls atomic.Int64 +} + +func (c *countingOrigin) Head(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) { + c.headCalls.Add(1) + return c.inner.Head(ctx, bucket, key) +} + +func (c *countingOrigin) GetRange(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error) { + return c.inner.GetRange(ctx, bucket, key, etag, off, n) +} + +func (c *countingOrigin) List(ctx context.Context, bucket, prefix, marker string, max int) (origin.ListResult, error) { + return c.inner.List(ctx, bucket, prefix, marker, max) +} diff --git a/internal/orca/inttest/azure_test.go b/internal/orca/inttest/azure_test.go new file mode 100644 index 00000000..5c9ab1dd --- /dev/null +++ b/internal/orca/inttest/azure_test.go @@ -0,0 +1,45 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "bytes" + "context" + "net/http" + "testing" + "time" +) + +// TestAzureBlobOrigin_ColdGet verifies the azureblob origin driver +// works against Azurite end-to-end on a 3-replica cluster. The +// MediumBlob spans 2 chunks so rendezvous-hashed routing typically +// exercises both fillLocal and FillFromPeer in a single run. +func TestAzureBlobOrigin_ColdGet(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), 90*time.Second) + defer cancel() + + ctr := pkgAzurite.NewContainer(ctx, t, "orca-origin") + blob := MediumBlob() + SeedAzure(ctx, t, pkgAzurite, ctr, []SeedBlob{blob}) + + cl := StartCluster(ctx, t, ClusterOptions{ + LocalStack: pkgLocalStack, + Azurite: pkgAzurite, + OriginDriver: "azureblob", + AzureContainer: ctr, + }) + + resp := cl.Get(1).HTTP.Get(ctx, t, ctr, blob.Key) + if resp.Status != http.StatusOK { + t.Fatalf("status=%d body=%s", resp.Status, string(resp.Body)) + } + + if !bytes.Equal(resp.Body, blob.Data) { + t.Fatalf("body mismatch: got %d bytes want %d", len(resp.Body), len(blob.Data)) + } +} diff --git a/internal/orca/inttest/azurite.go b/internal/orca/inttest/azurite.go new file mode 100644 index 00000000..451f81ec --- /dev/null +++ b/internal/orca/inttest/azurite.go @@ -0,0 +1,169 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "crypto/rand" + "encoding/hex" + "fmt" + "testing" + + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blob" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/container" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/pageblob" + "github.com/testcontainers/testcontainers-go" + "github.com/testcontainers/testcontainers-go/wait" +) + +// Azurite is a running Azurite container with helper accessors for +// constructing azblob clients pointed at the well-known dev account. +type Azurite struct { + container testcontainers.Container + endpoint string // http://host:port/devstoreaccount1 +} + +// Endpoint returns the Azurite blob-service URL including the +// devstoreaccount1 path segment. +func (az *Azurite) Endpoint() string { return az.endpoint } + +// AccountName returns the well-known Azurite dev account name. +func (az *Azurite) AccountName() string { return azuriteAccountName } + +// AccountKey returns the well-known Azurite dev account key. +func (az *Azurite) AccountKey() string { return azuriteAccountKey } + +// StartAzurite launches an Azurite container and returns once the +// blob-service port is reachable. Caller terminates via Terminate or +// t.Cleanup. +func StartAzurite(ctx context.Context) (*Azurite, error) { + req := testcontainers.ContainerRequest{ + Image: azuriteImage, + ExposedPorts: []string{azuritePort + "/tcp"}, + // `azurite-blob` listens on 0.0.0.0 by default; --skipApiVersionCheck + // keeps the SDK happy for newer client versions. + Cmd: []string{"azurite-blob", "--blobHost", "0.0.0.0", "--skipApiVersionCheck"}, + WaitingFor: wait.ForListeningPort(azuritePort + "/tcp"), + } + + c, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ + ContainerRequest: req, + Started: true, + }) + if err != nil { + return nil, fmt.Errorf("start azurite: %w", err) + } + + host, err := c.Host(ctx) + if err != nil { + _ = c.Terminate(ctx) //nolint:errcheck // best-effort cleanup + return nil, fmt.Errorf("azurite host: %w", err) + } + + port, err := c.MappedPort(ctx, azuritePort+"/tcp") + if err != nil { + _ = c.Terminate(ctx) //nolint:errcheck // best-effort cleanup + return nil, fmt.Errorf("azurite port: %w", err) + } + + endpoint := fmt.Sprintf("http://%s:%s/%s", host, port.Port(), azuriteAccountName) + + return &Azurite{ + container: c, + endpoint: endpoint, + }, nil +} + +// Terminate stops and removes the Azurite container. +func (az *Azurite) Terminate(ctx context.Context) error { + return az.container.Terminate(ctx) +} + +// NewServiceClient returns an azblob.Client authenticated with the +// well-known Azurite dev creds. +func (az *Azurite) NewServiceClient(t *testing.T) *azblob.Client { + t.Helper() + + cred, err := azblob.NewSharedKeyCredential(az.AccountName(), az.AccountKey()) + if err != nil { + t.Fatalf("azurite shared key cred: %v", err) + } + + cli, err := azblob.NewClientWithSharedKeyCredential(az.endpoint, cred, nil) + if err != nil { + t.Fatalf("azurite client: %v", err) + } + + return cli +} + +// NewContainer creates a fresh container and registers a cleanup. The +// container name is returned. +func (az *Azurite) NewContainer(ctx context.Context, t *testing.T, prefix string) string { + t.Helper() + + cli := az.NewServiceClient(t) + name := uniqueName(prefix) + + if _, err := cli.CreateContainer(ctx, name, nil); err != nil { + t.Fatalf("create container %s: %v", name, err) + } + + t.Cleanup(func() { + _, _ = cli.DeleteContainer(context.Background(), name, nil) //nolint:errcheck // best-effort cleanup + }) + + return name +} + +// UploadBlockBlob uploads bytes as a block blob to (container, name). +func (az *Azurite) UploadBlockBlob(ctx context.Context, t *testing.T, ctr, name string, data []byte) { + t.Helper() + + cli := az.NewServiceClient(t) + if _, err := cli.UploadBuffer(ctx, ctr, name, data, nil); err != nil { + t.Fatalf("upload block blob %s/%s: %v", ctr, name, err) + } +} + +// UploadPageBlob uploads bytes as a page blob (used to exercise the +// unsupported-blob-type rejection path in the azureblob driver). Size +// must be a multiple of 512. +func (az *Azurite) UploadPageBlob(ctx context.Context, t *testing.T, ctr, name string, size int64) { + t.Helper() + + cred, err := azblob.NewSharedKeyCredential(az.AccountName(), az.AccountKey()) + if err != nil { + t.Fatalf("azurite shared key cred: %v", err) + } + + containerCli, err := container.NewClientWithSharedKeyCredential( + fmt.Sprintf("%s/%s", az.endpoint, ctr), cred, nil) + if err != nil { + t.Fatalf("container client: %v", err) + } + + pbCli := containerCli.NewPageBlobClient(name) + if _, err := pbCli.Create(ctx, size, &pageblob.CreateOptions{ + HTTPHeaders: &blob.HTTPHeaders{}, + }); err != nil { + t.Fatalf("create page blob: %v", err) + } + // Page blobs created here are zero-filled; tests don't read content + // because the azureblob driver rejects non-Block-Blob types before + // the GET stage. +} + +// uniqueName returns a short random-suffixed name suitable for +// LocalStack buckets and Azurite containers. +func uniqueName(prefix string) string { + var b [4]byte + + _, _ = rand.Read(b[:]) //nolint:errcheck // crypto/rand never fails on linux + + return fmt.Sprintf("%s-%s", prefix, hex.EncodeToString(b[:])) +} diff --git a/internal/orca/inttest/client.go b/internal/orca/inttest/client.go new file mode 100644 index 00000000..78543451 --- /dev/null +++ b/internal/orca/inttest/client.go @@ -0,0 +1,127 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "encoding/xml" + "fmt" + "io" + "net/http" + "testing" +) + +// Client is a thin HTTP wrapper that targets a single replica's edge +// listener and provides typed helpers (GET, GET-Range, HEAD, LIST) for +// test assertions. +type Client struct { + BaseURL string + HTTP *http.Client +} + +// NewClient returns a Client targeting baseURL (e.g. http://127.0.0.1:34567). +func NewClient(baseURL string) *Client { + return &Client{ + BaseURL: baseURL, + HTTP: &http.Client{}, + } +} + +// GetResponse is the result of a GET / HEAD request. +type GetResponse struct { + Status int + Header http.Header + Body []byte +} + +// Get fetches the full body of /bucket/key. +func (c *Client) Get(ctx context.Context, t *testing.T, bucket, key string) GetResponse { + t.Helper() + + return c.do(ctx, t, http.MethodGet, fmt.Sprintf("/%s/%s", bucket, key), nil) +} + +// GetRange fetches a byte range from /bucket/key. +func (c *Client) GetRange(ctx context.Context, t *testing.T, bucket, key string, start, end int64) GetResponse { + t.Helper() + + hdr := http.Header{} + hdr.Set("Range", fmt.Sprintf("bytes=%d-%d", start, end)) + + return c.do(ctx, t, http.MethodGet, fmt.Sprintf("/%s/%s", bucket, key), hdr) +} + +// Head issues a HEAD against /bucket/key. +func (c *Client) Head(ctx context.Context, t *testing.T, bucket, key string) GetResponse { + t.Helper() + + return c.do(ctx, t, http.MethodHead, fmt.Sprintf("/%s/%s", bucket, key), nil) +} + +// ListBucketResult mirrors the (subset) S3 ListObjectsV2 XML response +// shape produced by the orca edge handler. +type ListBucketResult struct { + XMLName xml.Name `xml:"ListBucketResult"` + Name string `xml:"Name"` + Prefix string `xml:"Prefix"` + KeyCount int `xml:"KeyCount"` + Contents []struct { + Key string `xml:"Key"` + Size int64 `xml:"Size"` + ETag string `xml:"ETag"` + } `xml:"Contents"` +} + +// List issues a LIST against /bucket/?list-type=2&prefix=. +func (c *Client) List(ctx context.Context, t *testing.T, bucket, prefix string) ListBucketResult { + t.Helper() + + resp := c.do(ctx, t, http.MethodGet, + fmt.Sprintf("/%s/?list-type=2&prefix=%s", bucket, prefix), nil) + if resp.Status != http.StatusOK { + t.Fatalf("LIST status=%d body=%s", resp.Status, string(resp.Body)) + } + + var out ListBucketResult + if err := xml.Unmarshal(resp.Body, &out); err != nil { + t.Fatalf("LIST decode: %v body=%s", err, string(resp.Body)) + } + + return out +} + +func (c *Client) do(ctx context.Context, t *testing.T, method, path string, hdr http.Header) GetResponse { + t.Helper() + + req, err := http.NewRequestWithContext(ctx, method, c.BaseURL+path, nil) + if err != nil { + t.Fatalf("build request: %v", err) + } + + for k, vs := range hdr { + for _, v := range vs { + req.Header.Add(k, v) + } + } + + resp, err := c.HTTP.Do(req) + if err != nil { + t.Fatalf("%s %s: %v", method, path, err) + } + + defer func() { _ = resp.Body.Close() }() //nolint:errcheck // body close best-effort in tests + + body, err := io.ReadAll(resp.Body) + if err != nil { + t.Fatalf("read body: %v", err) + } + + return GetResponse{ + Status: resp.StatusCode, + Header: resp.Header, + Body: body, + } +} diff --git a/internal/orca/inttest/doc.go b/internal/orca/inttest/doc.go new file mode 100644 index 00000000..ac83f611 --- /dev/null +++ b/internal/orca/inttest/doc.go @@ -0,0 +1,75 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +// Package inttest contains integration tests for the Orca cache. +// +// Build tag `integrationtest` gates these tests; run via: +// +// make orca-inttest +// +// Equivalent to: +// +// go test -tags=integrationtest -race -timeout 15m \ +// ./internal/orca/inttest/... +// +// # Architecture +// +// The harness brings up real LocalStack and Azurite containers via +// testcontainers-go and constructs N in-process *app.App instances +// wired to those containers. By default StartCluster runs 3 replicas, +// matching the production deploy/orca topology. +// +// Every replica binds to 127.0.0.1 with an OS-assigned distinct +// internal port; the cluster.Peer struct now carries an explicit Port +// (zero in production, set in tests) and FillFromPeer dials peer.IP + +// peer.Port. This lets multi-replica tests run on every platform +// (Linux, macOS, Windows / WSL) without loopback-alias setup. +// +// Each replica owns its own StaticPeerSource (cluster.PeerSource). +// Tests that need to induce membership disagreement mutate one +// replica's source; the cluster's refresh goroutine picks up the +// change within MembershipRefresh (250 ms in tests). +// +// # Container lifecycle +// +// TestMain starts one LocalStack and one Azurite container per +// `go test` invocation; per-test buckets/containers prevent +// cross-test interference. +// +// # File layout +// +// - e2e_test.go - the canonical end-to-end suite (3 replicas). +// Boot-self-test, cold/warm GET, ranged GET, multi-chunk GET, +// LIST, HEAD, NotFound, rendezvous coordinator routing, +// singleflight collapse, peer-not-coordinator fallback (real). +// - azure_test.go - azureblob origin driver smoke against Azurite +// (3 replicas). +// +// Driver-level branch coverage (versioning gate, blob-type +// rejection) lives as fast unit tests in the respective driver +// packages (cachestore/s3, origin/azureblob), not here. +// +// # Adding a scenario +// +// 1. Pick the right entry point: StartCluster (3-replica default). +// Tests that need to assert on a boot-time failure mode that +// surfaces before any chunk fetch (versioning gate, blob-type +// rejection, etc.) should live as unit tests in the respective +// driver package. +// 2. Seed the origin: SeedS3 or SeedAzure. +// 3. Issue requests via cl.Get(i).HTTP.Get / GetRange / Head / List. +// 4. Assert byte-exact body, status code, and (where relevant) origin +// RPC counts via the optional CountingOrigin or peer 409 counts via +// CountingInternalHandlerWrap. +// +// # TODO (genuinely future work) +// +// - TestEtagChange (mid-fill mutation): requires a deterministic +// test seam in fetch.Coordinator (e.g. a hook that pauses between +// chunk fetches) so the test can rewrite the origin object +// between chunk 0 and chunk 1 of the same fill. +// - Fault-injection origin / cachestore decorators: useful for +// timeout, throttle, and 5xx retry-budget assertions. +package inttest diff --git a/internal/orca/inttest/e2e_test.go b/internal/orca/inttest/e2e_test.go new file mode 100644 index 00000000..c384fc61 --- /dev/null +++ b/internal/orca/inttest/e2e_test.go @@ -0,0 +1,496 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "bytes" + "context" + "net/http" + "strconv" + "sync" + "testing" + "time" + + "github.com/Azure/unbounded/internal/orca/chunk" + "github.com/Azure/unbounded/internal/orca/cluster" +) + +// e2e_test.go is the canonical end-to-end suite for orca: every +// scenario runs against a 3-replica in-process cluster pointed at +// LocalStack. Tests that exercise chunk fetching naturally exercise +// both the local-fill path (when self happens to win rendezvous for +// a chunk) and the cross-replica /internal/fill path (when a peer +// wins). +// +// Driver-level branch coverage (versioning gate, blob-type rejection, +// HTTP error mapping, range parsing, chunk arithmetic, config env +// fallback) lives as fast unit tests in the respective driver / server +// / chunk / config packages. The scenarios here are reserved for +// behavior that can only be verified end-to-end against real +// LocalStack (or Azurite, in azure_test.go) plus a real cluster of +// in-process orca instances. + +// TestColdAndWarmGet exercises GET twice for the same single-chunk +// blob: cold (origin fetch + cache commit) and warm (cachestore hit). +// The warm phase deletes the origin object first to prove the cache +// hit really happened. +func TestColdAndWarmGet(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), 60*time.Second) + defer cancel() + + bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin") + blob := SmallBlob() + SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{blob}) + + cl := StartCluster(ctx, t, ClusterOptions{ + LocalStack: pkgLocalStack, + OriginBucket: bucket, + }) + + cold := cl.Get(1).HTTP.Get(ctx, t, bucket, blob.Key) + if cold.Status != http.StatusOK { + t.Fatalf("cold status=%d body=%s", cold.Status, string(cold.Body)) + } + + if !bytes.Equal(cold.Body, blob.Data) { + t.Fatalf("cold body mismatch: got %d bytes, want %d", len(cold.Body), len(blob.Data)) + } + + if cold.Header.Get("ETag") == "" { + t.Errorf("expected ETag header on cold GET") + } + + DeleteS3Object(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, blob.Key) + + warm := cl.Get(1).HTTP.Get(ctx, t, bucket, blob.Key) + if warm.Status != http.StatusOK { + t.Fatalf("warm status=%d body=%s", warm.Status, string(warm.Body)) + } + + if !bytes.Equal(warm.Body, blob.Data) { + t.Fatalf("warm body mismatch: got %d bytes, want %d", len(warm.Body), len(blob.Data)) + } +} + +// TestRangedGet verifies byte-range requests return 206 + +// Content-Range + the requested slice. Covers within-chunk, +// cross-chunk, and (against a 64-chunk blob) various boundary edge +// cases. The chunk-arithmetic branches are unit-tested separately in +// internal/orca/chunk; this verifies the end-to-end HTTP Range +// round-trip with real chunk bodies. +func TestRangedGet(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), 120*time.Second) + defer cancel() + + bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin") + medium := MediumBlob() // 1.5 MiB == 2 chunks at 1 MiB + huge := HugeBlob() // 64 MiB == 64 chunks at 1 MiB + SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{medium, huge}) + + cl := StartCluster(ctx, t, ClusterOptions{ + LocalStack: pkgLocalStack, + OriginBucket: bucket, + }) + + resp := cl.Get(1).HTTP.GetRange(ctx, t, bucket, medium.Key, 100, 199) + if resp.Status != http.StatusPartialContent { + t.Fatalf("status=%d (want 206)", resp.Status) + } + + if cr := resp.Header.Get("Content-Range"); cr == "" { + t.Errorf("expected Content-Range header") + } + + want := medium.Data[100:200] + if !bytes.Equal(resp.Body, want) { + t.Fatalf("range body mismatch: got %d bytes, want %d", len(resp.Body), len(want)) + } + + chunkSize := int64(1024 * 1024) + resp2 := cl.Get(1).HTTP.GetRange(ctx, t, bucket, medium.Key, chunkSize-50, chunkSize+49) + + if resp2.Status != http.StatusPartialContent { + t.Fatalf("cross-chunk status=%d (want 206)", resp2.Status) + } + + want2 := medium.Data[chunkSize-50 : chunkSize+50] + if !bytes.Equal(resp2.Body, want2) { + t.Fatalf("cross-chunk range mismatch: got %d bytes, want %d", len(resp2.Body), len(want2)) + } + + t.Run("huge blob boundary cases", func(t *testing.T) { + const chunk = int64(1024 * 1024) + + cases := []struct { + name string + start, end int64 + }{ + {"starts exactly at chunk boundary 32", 32 * chunk, 32*chunk + 100}, + {"ends exactly at chunk boundary 47", 48*chunk - 100, 48*chunk - 1}, + {"covers chunks 10-12 (3 contiguous full chunks)", 10 * chunk, 13*chunk - 1}, + {"straddles 5 consecutive boundaries (chunks 20-25)", 20*chunk + 100, 25*chunk + 200}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + rr := cl.Get(1).HTTP.GetRange(ctx, t, bucket, huge.Key, tc.start, tc.end) + if rr.Status != http.StatusPartialContent { + t.Fatalf("status=%d (want 206)", rr.Status) + } + + expected := huge.Data[tc.start : tc.end+1] + if !bytes.Equal(rr.Body, expected) { + t.Fatalf("body mismatch: got %d bytes, want %d", len(rr.Body), len(expected)) + } + }) + } + }) +} + +// TestMultiChunkGet verifies a full GET of a 64-chunk blob assembles +// correctly across chunk boundaries. With 3 replicas and 64 chunks, +// rendezvous-hashed coordinator selection statistically guarantees +// every replica is the coordinator for many chunks, so this test +// exercises both fillLocal and FillFromPeer paths thoroughly in a +// single run. +func TestMultiChunkGet(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), 120*time.Second) + defer cancel() + + bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin") + blob := HugeBlob() + SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{blob}) + + cl := StartCluster(ctx, t, ClusterOptions{ + LocalStack: pkgLocalStack, + OriginBucket: bucket, + }) + + resp := cl.Get(1).HTTP.Get(ctx, t, bucket, blob.Key) + if resp.Status != http.StatusOK { + t.Fatalf("status=%d body=%s", resp.Status, string(resp.Body)) + } + + if !bytes.Equal(resp.Body, blob.Data) { + t.Fatalf("body mismatch: got %d bytes, want %d", len(resp.Body), len(blob.Data)) + } +} + +// TestRendezvousCoordinatorRouting verifies that a GET against a +// non-coordinator replica routes through /internal/fill to the +// coordinator and still returns the body. The CountingOrigin +// decorator confirms exactly one origin GetRange happened across the +// cluster (the coordinator's). +func TestRendezvousCoordinatorRouting(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), 90*time.Second) + defer cancel() + + bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin") + blob := SmallBlob() + SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{blob}) + + count := newCountingOriginForLocalStack(ctx, t, bucket) + + cl := StartCluster(ctx, t, ClusterOptions{ + LocalStack: pkgLocalStack, + OriginBucket: bucket, + OriginOverride: count, + }) + + headResp := cl.Get(1).HTTP.Head(ctx, t, bucket, blob.Key) + + etag := stripQuotes(headResp.Header.Get("ETag")) + if etag == "" { + t.Fatalf("HEAD returned empty ETag: %+v", headResp.Header) + } + + k := chunk.Key{ + OriginID: "inttest-origin", + Bucket: bucket, + ObjectKey: blob.Key, + ETag: etag, + ChunkSize: int64(1024 * 1024), + Index: 0, + } + coord := cl.Get(1).App.Cluster.Coordinator(k) + + var nonCoord *Replica + + for _, r := range cl.Replicas { + if r.SelfIP != coord.IP || r.InternalPort != coord.Port { + nonCoord = r + break + } + } + + if nonCoord == nil { + t.Fatalf("could not find a non-coordinator replica; coord=%+v peers=%+v", + coord, cl.Get(1).App.Cluster.Peers()) + } + + count.Reset() + + resp := nonCoord.HTTP.Get(ctx, t, bucket, blob.Key) + if resp.Status != http.StatusOK { + t.Fatalf("status=%d body=%s", resp.Status, string(resp.Body)) + } + + if !bytes.Equal(resp.Body, blob.Data) { + t.Fatalf("body mismatch: got %d bytes, want %d", len(resp.Body), len(blob.Data)) + } + // Exactly one HEAD (HeadObject metadata cache) plus one GetRange + // (single chunk fetch). Cluster-wide dedup must not produce more. + if got := count.GetRanges(); got != 1 { + t.Errorf("origin GetRange count=%d (want 1)", got) + } +} + +// TestSingleflightCollapse fires N concurrent GETs (one per replica) +// for the same key and asserts the origin saw exactly one GetRange +// per chunk (cluster-wide singleflight collapse). +func TestSingleflightCollapse(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), 120*time.Second) + defer cancel() + + bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin") + blob := HugeBlob() // 64 chunks + SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{blob}) + + count := newCountingOriginForLocalStack(ctx, t, bucket) + + cl := StartCluster(ctx, t, ClusterOptions{ + LocalStack: pkgLocalStack, + OriginBucket: bucket, + OriginOverride: count, + }) + + count.Reset() + + var wg sync.WaitGroup + + wg.Add(cl.Len()) + + results := make([][]byte, cl.Len()) + statuses := make([]int, cl.Len()) + + for i := 1; i <= cl.Len(); i++ { + go func(i int) { + defer wg.Done() + + r := cl.Get(i).HTTP.Get(ctx, t, bucket, blob.Key) + results[i-1] = r.Body + statuses[i-1] = r.Status + }(i) + } + + wg.Wait() + + for i, s := range statuses { + if s != http.StatusOK { + t.Fatalf("replica %d status=%d", i+1, s) + } + + if !bytes.Equal(results[i], blob.Data) { + t.Fatalf("replica %d body mismatch: got %d bytes want %d", i+1, len(results[i]), len(blob.Data)) + } + } + // HugeBlob spans 64 chunks; cluster-wide singleflight should + // dedupe each chunk to exactly one origin GetRange. Allow up to + // 76 (~20% slack) to absorb timing-dependent races where a + // joiner arrives during in-flight commit. + if got := count.GetRanges(); got > 76 { + t.Errorf("origin GetRange count=%d (want <= 76 for 64-chunk blob)", got) + } + + if got := count.GetRanges(); got < 64 { + t.Errorf("origin GetRange count=%d (want >= 64 for 64-chunk cold fill)", got) + } +} + +// TestPeerNotCoordinatorFallback induces real membership disagreement +// and asserts the coordinator's /internal/fill returns 409 and the +// requesting replica's local-fill fallback succeeds. +// +// Setup: +// +// - 3-replica cluster with shared CountingInternalHandlerWrap so we +// can read 409 counts per receiving replica. +// - HEAD the seeded blob to learn ETag; compute Coordinator(k) for +// chunk 0 from replica 1's view (call it C). +// - Craft a phantom peer P (an unreachable IP/Port pair) whose +// rendezvous score for k is higher than C's. Mutate C's peer +// source to include P plus C itself; now C.IsCoordinator(k) +// returns false because P wins. +// - Find another replica R whose view still says C is the +// coordinator. GET via R. +// +// Expected: +// +// - R issues /internal/fill to C. +// - C responds 409 (its IsCoordinator returns false because P wins). +// - R falls through to fillLocal, fetches the origin, serves the +// body. +// - counter.Count(C, 409) >= 1. +func TestPeerNotCoordinatorFallback(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), 90*time.Second) + defer cancel() + + bucket := pkgLocalStack.NewBucket(ctx, t, "orca-origin") + blob := SmallBlob() + SeedS3(ctx, t, pkgLocalStack.NewS3Client(ctx, t), bucket, []SeedBlob{blob}) + + wrap := NewCountingInternalHandlerWrap() + + cl := StartCluster(ctx, t, ClusterOptions{ + LocalStack: pkgLocalStack, + OriginBucket: bucket, + InternalHandlerWrap: wrap, + }) + + headResp := cl.Get(1).HTTP.Head(ctx, t, bucket, blob.Key) + + etag := stripQuotes(headResp.Header.Get("ETag")) + if etag == "" { + t.Fatalf("HEAD returned empty ETag: %+v", headResp.Header) + } + + k := chunk.Key{ + OriginID: "inttest-origin", + Bucket: bucket, + ObjectKey: blob.Key, + ETag: etag, + ChunkSize: int64(1024 * 1024), + Index: 0, + } + coord := cl.Get(1).App.Cluster.Coordinator(k) + + coordReplica := cl.FindBySelfIPPort(coord.IP, coord.Port) + if coordReplica == nil { + t.Fatalf("coord %+v not found among replicas", coord) + } + + // Craft a phantom peer whose rendezvous score beats coord's for k. + // The phantom's IP/Port don't need to be reachable; it's never + // dialed, only used to skew rendezvous on coord's view. + pathBytes := []byte(k.Path()) + coordScore := cluster.Score(coord, pathBytes) + phantom := cluster.Peer{IP: "203.0.113.1"} // TEST-NET-3, unreachable + + for port := 1; port < 65536; port++ { + phantom.Port = port + if cluster.Score(phantom, pathBytes) > coordScore { + break + } + } + + if cluster.Score(phantom, pathBytes) <= coordScore { + t.Fatalf("could not find a phantom peer beating coord rendezvous score") + } + + // Build coord's new peer-set: original real peers plus the + // phantom. The StaticPeerSource will stamp Self=true only on the + // peer matching coord's (selfIP, selfPort), so coord still + // recognizes itself; but the phantom wins rendezvous, so + // coord.IsCoordinator(k) flips to false. + newPeers := make([]cluster.Peer, 0, cl.Len()+1) + for _, r := range cl.Replicas { + newPeers = append(newPeers, cluster.Peer{IP: r.SelfIP, Port: r.InternalPort}) + } + + newPeers = append(newPeers, phantom) + coordReplica.PeerSource.SetPeers(newPeers) + + if err := waitForCondition(ctx, 2*time.Second, func() bool { + return !coordReplica.App.Cluster.IsCoordinator(k) + }); err != nil { + t.Fatalf("coord did not relinquish coordinator status: %v", err) + } + // Find a replica R whose view still says coord is the coordinator. + var requester *Replica + + for _, r := range cl.Replicas { + if r == coordReplica { + continue + } + + rc := r.App.Cluster.Coordinator(k) + if rc.IP == coord.IP && rc.Port == coord.Port { + requester = r + break + } + } + + if requester == nil { + t.Fatalf("no non-coord replica still views coord %+v as coordinator", coord) + } + + resp := requester.HTTP.Get(ctx, t, bucket, blob.Key) + if resp.Status != http.StatusOK { + t.Fatalf("status=%d body=%s", resp.Status, string(resp.Body)) + } + + if !bytes.Equal(resp.Body, blob.Data) { + t.Fatalf("body mismatch: got %d bytes, want %d", len(resp.Body), len(blob.Data)) + } + + coordKey := coord.IP + ":" + strconv.Itoa(coord.Port) + if got := wrap.Count(coordKey, http.StatusConflict); got < 1 { + t.Fatalf("expected at least one 409 from coord %s; got %d", + coordKey, got) + } +} + +func newCountingOriginForLocalStack(ctx context.Context, t *testing.T, bucket string) *CountingOrigin { + t.Helper() + + or, err := localStackOrigin(ctx, t, bucket) + if err != nil { + t.Fatalf("localStackOrigin: %v", err) + } + + return NewCountingOrigin(or) +} + +func stripQuotes(s string) string { + if len(s) >= 2 && s[0] == '"' && s[len(s)-1] == '"' { + return s[1 : len(s)-1] + } + + return s +} + +func waitForCondition(ctx context.Context, dl time.Duration, cond func() bool) error { + deadline := time.Now().Add(dl) + for time.Now().Before(deadline) { + if cond() { + return nil + } + + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(25 * time.Millisecond): + } + } + + if cond() { + return nil + } + + return context.DeadlineExceeded +} diff --git a/internal/orca/inttest/harness.go b/internal/orca/inttest/harness.go new file mode 100644 index 00000000..48a99c92 --- /dev/null +++ b/internal/orca/inttest/harness.go @@ -0,0 +1,378 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "fmt" + "io" + "log/slog" + "net" + "strconv" + "testing" + "time" + + "github.com/Azure/unbounded/internal/orca/app" + "github.com/Azure/unbounded/internal/orca/cachestore" + "github.com/Azure/unbounded/internal/orca/cluster" + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/origin" +) + +// ClusterOptions controls Harness.StartCluster. +type ClusterOptions struct { + // Replicas is the number of in-process orca instances. Defaults + // to 3 when zero, matching the production deploy/orca topology. + Replicas int + + // ChunkSize is the per-chunk byte count. The orca config validator + // enforces a 1 MiB minimum; tests typically use 1 MiB to keep test + // blob sizes manageable while still spanning multiple chunks. + ChunkSize int64 + + // OriginID is the logical origin identifier (echoed in chunk paths). + OriginID string + + // OriginBucket is the bucket on the origin LocalStack/Azurite. + OriginBucket string + + // OriginDriver is "awss3" (default) or "azureblob". + OriginDriver string + + // LocalStack is the LocalStack handle used for origin (when + // OriginDriver=="awss3") and always for cachestore. + LocalStack *LocalStack + + // Azurite is required when OriginDriver=="azureblob". + Azurite *Azurite + + // AzureContainer is the Azurite container name for the origin. + AzureContainer string + + // CachestoreBucket is the bucket on LocalStack used as the orca + // cachestore. If empty, a fresh bucket is allocated. + CachestoreBucket string + + // OriginOverride, when set, replaces the constructed origin driver. + // Used to wire CountingOrigin around the real client. + OriginOverride origin.Origin + + // CacheStoreOverride, when set, replaces the constructed cachestore + // driver. + CacheStoreOverride cachestore.CacheStore + + // InternalHandlerWrap, when set, is registered with each replica's + // app.WithInternalHandlerWrap. Tests use this to install a 409 + // counter (CountingInternalHandlerWrap.WrapFor). + InternalHandlerWrap *CountingInternalHandlerWrap +} + +// Replica represents one running *app.App in the harness. +type Replica struct { + App *app.App + SelfIP string + InternalPort int + PeerSource *StaticPeerSource + HTTP *Client // pre-built client targeting this replica's edge +} + +// Cluster is a collection of Replicas plus the harness-owned context. +type Cluster struct { + Replicas []*Replica +} + +// Get returns replica i (1-indexed). +func (c *Cluster) Get(i int) *Replica { return c.Replicas[i-1] } + +// Len returns the replica count. +func (c *Cluster) Len() int { return len(c.Replicas) } + +// FindBySelfIPPort returns the replica whose (SelfIP, InternalPort) +// matches the given peer; nil if none. +func (c *Cluster) FindBySelfIPPort(ip string, port int) *Replica { + for _, r := range c.Replicas { + if r.SelfIP == ip && r.InternalPort == port { + return r + } + } + + return nil +} + +// StartCluster brings up `opts.Replicas` orca instances (default 3) +// pointed at the origin/cachestore described in opts. Every replica +// binds to 127.0.0.1 with an OS-assigned distinct internal port; one +// StaticPeerSource per replica is initialized with the full peer set +// (with explicit ports). Tests can mutate any replica's PeerSource +// independently. +// +// Cleanup (Shutdown of each app) is registered with t.Cleanup. +func StartCluster(ctx context.Context, t *testing.T, opts ClusterOptions) *Cluster { + t.Helper() + + if opts.Replicas == 0 { + opts.Replicas = 3 + } + + if opts.Replicas < 1 { + t.Fatalf("StartCluster: Replicas must be >= 1, got %d", opts.Replicas) + } + + if opts.ChunkSize == 0 { + opts.ChunkSize = 1024 * 1024 + } + + if opts.OriginDriver == "" { + opts.OriginDriver = "awss3" + } + + if opts.OriginID == "" { + opts.OriginID = "inttest-origin" + } + + if opts.LocalStack == nil { + t.Fatal("StartCluster: LocalStack handle required") + } + + if opts.OriginDriver == "azureblob" { + if opts.Azurite == nil { + t.Fatal("StartCluster: Azurite handle required for azureblob driver") + } + + if opts.AzureContainer == "" { + t.Fatal("StartCluster: AzureContainer required for azureblob driver") + } + } + + if opts.OriginBucket == "" && opts.OriginDriver == "awss3" { + t.Fatal("StartCluster: OriginBucket required for awss3 driver") + } + + cacheBucket := opts.CachestoreBucket + if cacheBucket == "" { + cacheBucket = opts.LocalStack.NewBucket(ctx, t, "orca-cache") + } + + // Allocate per-replica internal listeners up front (open) so each + // replica's peer source can advertise the full set with explicit + // ports from t=0. We hand the open listeners to app.Start via + // WithInternalListener/WithEdgeListener/WithOpsListener so there + // is no close-and-rebind window for races with concurrent tests. + internalListeners := make([]net.Listener, opts.Replicas) + internalPorts := make([]int, opts.Replicas) + edgeListeners := make([]net.Listener, opts.Replicas) + opsListeners := make([]net.Listener, opts.Replicas) + + for i := range internalListeners { + ln, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + closeListeners(internalListeners) + closeListeners(edgeListeners) + closeListeners(opsListeners) + t.Fatalf("alloc internal port for replica %d: %v", i+1, err) + } + + internalListeners[i] = ln + internalPorts[i] = ln.Addr().(*net.TCPAddr).Port //nolint:errcheck // *net.TCPAddr from net.Listen + + eln, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + closeListeners(internalListeners) + closeListeners(edgeListeners) + closeListeners(opsListeners) + t.Fatalf("alloc edge port for replica %d: %v", i+1, err) + } + + edgeListeners[i] = eln + + oln, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + closeListeners(internalListeners) + closeListeners(edgeListeners) + closeListeners(opsListeners) + t.Fatalf("alloc ops port for replica %d: %v", i+1, err) + } + + opsListeners[i] = oln + } + + allPeers := make([]cluster.Peer, opts.Replicas) + for i := range allPeers { + allPeers[i] = cluster.Peer{ + IP: "127.0.0.1", + Port: internalPorts[i], + } + } + + cl := &Cluster{} + + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + + for i := 0; i < opts.Replicas; i++ { + selfIP := "127.0.0.1" + selfPort := internalPorts[i] + ps := NewStaticPeerSource(selfIP, selfPort, allPeers) + + cfg := buildConfig(opts, cacheBucket) + cfg.Cluster.SelfPodIP = selfIP + cfg.Cluster.InternalListen = net.JoinHostPort(selfIP, strconv.Itoa(selfPort)) + cfg.Server.Listen = edgeListeners[i].Addr().String() + + appOpts := []app.Option{ + app.WithLogger(logger), + app.WithPeerSource(ps), + app.WithEdgeListener(edgeListeners[i]), + app.WithInternalListener(internalListeners[i]), + app.WithOpsListener(opsListeners[i]), + } + + if opts.OriginOverride != nil { + appOpts = append(appOpts, app.WithOrigin(opts.OriginOverride)) + } + + if opts.CacheStoreOverride != nil { + appOpts = append(appOpts, app.WithCacheStore(opts.CacheStoreOverride)) + } + + if opts.InternalHandlerWrap != nil { + appOpts = append(appOpts, app.WithInternalHandlerWrap(opts.InternalHandlerWrap.WrapFor(selfIP+":"+strconv.Itoa(selfPort)))) + } + + a, err := app.Start(ctx, cfg, appOpts...) + if err != nil { + t.Fatalf("app.Start replica %d: %v", i+1, err) + } + + r := &Replica{ + App: a, + SelfIP: selfIP, + InternalPort: selfPort, + PeerSource: ps, + HTTP: NewClient("http://" + a.EdgeAddr), + } + cl.Replicas = append(cl.Replicas, r) + + t.Cleanup(func() { + ctxShut, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + _ = a.Shutdown(ctxShut) //nolint:errcheck // shutdown logs already emitted + }) + } + // Wait for every replica's Cluster.Peers() to converge to the + // full set. + if err := waitForPeers(ctx, cl, opts.Replicas, 2*time.Second); err != nil { + t.Fatalf("waitForPeers: %v", err) + } + + return cl +} + +func buildConfig(opts ClusterOptions, cacheBucket string) *config.Config { + cfg := &config.Config{ + Server: config.Server{ + Listen: "127.0.0.1:0", + Auth: config.ServerAuth{Enabled: false}, + }, + Origin: config.Origin{ + ID: opts.OriginID, + Driver: opts.OriginDriver, + TargetGlobal: 32, + QueueTimeout: 5 * time.Second, + Retry: config.OriginRetry{ + Attempts: 2, + BackoffInitial: 10 * time.Millisecond, + BackoffMax: 50 * time.Millisecond, + MaxTotalDuration: 2 * time.Second, + }, + }, + Cachestore: config.Cachestore{ + Driver: "s3", + S3: config.CachestoreS3{ + Endpoint: opts.LocalStack.Endpoint(), + Bucket: cacheBucket, + Region: opts.LocalStack.Region(), + AccessKey: opts.LocalStack.AccessKey(), + SecretKey: opts.LocalStack.SecretKey(), + UsePathStyle: true, + }, + }, + Cluster: config.Cluster{ + Service: "orca-peers.test.svc.cluster.local", + MembershipRefresh: 250 * time.Millisecond, + InternalListen: "127.0.0.1:0", // overridden per replica + InternalTLS: config.InternalTLS{Enabled: false}, + TargetReplicas: opts.Replicas, + SelfPodIP: "127.0.0.1", // overridden per replica + }, + ChunkCatalog: config.ChunkCatalog{MaxEntries: 1024}, + Metadata: config.Metadata{ + TTL: 5 * time.Minute, + NegativeTTL: 5 * time.Second, + MaxEntries: 1024, + }, + Chunking: config.Chunking{Size: opts.ChunkSize}, + } + + switch opts.OriginDriver { + case "awss3": + cfg.Origin.AWSS3 = config.AWSS3{ + Endpoint: opts.LocalStack.Endpoint(), + Region: opts.LocalStack.Region(), + Bucket: opts.OriginBucket, + AccessKey: opts.LocalStack.AccessKey(), + SecretKey: opts.LocalStack.SecretKey(), + UsePathStyle: true, + } + case "azureblob": + cfg.Origin.Azureblob = config.Azureblob{ + Account: opts.Azurite.AccountName(), + AccountKey: opts.Azurite.AccountKey(), + Container: opts.AzureContainer, + Endpoint: opts.Azurite.Endpoint(), + } + } + + return cfg +} + +// waitForPeers polls each replica's cluster.Peers() until every +// replica has at least the expected count or the deadline elapses. +func waitForPeers(ctx context.Context, cl *Cluster, want int, dl time.Duration) error { + deadline := time.Now().Add(dl) + + for time.Now().Before(deadline) { + ok := true + + for _, r := range cl.Replicas { + if len(r.App.Cluster.Peers()) < want { + ok = false + break + } + } + + if ok { + return nil + } + + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(50 * time.Millisecond): + } + } + + return fmt.Errorf("peer-set did not converge to %d on all %d replicas within %s", + want, len(cl.Replicas), dl) +} + +func closeListeners(lns []net.Listener) { + for _, ln := range lns { + if ln != nil { + _ = ln.Close() //nolint:errcheck // best-effort cleanup + } + } +} diff --git a/internal/orca/inttest/images.go b/internal/orca/inttest/images.go new file mode 100644 index 00000000..d90aaba9 --- /dev/null +++ b/internal/orca/inttest/images.go @@ -0,0 +1,30 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +// Pinned container image tags. Bump centrally when upgrading. +const ( + // localstackImage is the LocalStack image used for both the origin + // (awss3) and cachestore (s3) backends. Pinned to 3.8 because + // later LocalStack tags require the AWS SDK CRC64NVME checksum + // opt-out (which the cachestore/s3 driver and this harness's S3 + // client builder both apply). + localstackImage = "localstack/localstack:3.8" + + // azuriteImage is the Azurite (Azure Blob emulator) image. We pin + // to a specific minor for reproducibility. + azuriteImage = "mcr.microsoft.com/azure-storage/azurite:3.34.0" + + // azuritePort is the blob-service port published by Azurite. + azuritePort = "10000" + + // azuriteAccountName is the well-known Azurite dev account. + azuriteAccountName = "devstoreaccount1" + + // azuriteAccountKey is the well-known Azurite dev account key. It + // is hard-coded by the emulator; not a secret. + azuriteAccountKey = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" +) diff --git a/internal/orca/inttest/internalwrap.go b/internal/orca/inttest/internalwrap.go new file mode 100644 index 00000000..78d29233 --- /dev/null +++ b/internal/orca/inttest/internalwrap.go @@ -0,0 +1,145 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "net/http" + "sync" + "sync/atomic" +) + +// CountingInternalHandlerWrap is an http.Handler decorator factory +// that counts response status codes per receiving replica IP. Used +// by TestPeerNotCoordinatorFallback to assert a peer's +// /internal/fill handler returned 409 (proving the cluster.go 409 +// fallback path actually fired on the requesting replica). +// +// One CountingInternalHandlerWrap is shared across all replicas in +// the harness; each replica's wrapped handler stamps its self IP +// onto the response writer so counts can be attributed back. +type CountingInternalHandlerWrap struct { + mu sync.Mutex + counts map[string]map[int]*atomic.Int64 // selfIP -> status -> count + defined map[string]struct{} +} + +// NewCountingInternalHandlerWrap returns an empty wrapper. +func NewCountingInternalHandlerWrap() *CountingInternalHandlerWrap { + return &CountingInternalHandlerWrap{ + counts: make(map[string]map[int]*atomic.Int64), + defined: make(map[string]struct{}), + } +} + +// WrapFor returns a wrap function suitable for app.WithInternalHandlerWrap +// that attributes status-code counts back to the named selfIP. +func (w *CountingInternalHandlerWrap) WrapFor(selfIP string) func(http.Handler) http.Handler { + w.mu.Lock() + if _, ok := w.counts[selfIP]; !ok { + w.counts[selfIP] = make(map[int]*atomic.Int64) + } + + w.defined[selfIP] = struct{}{} + w.mu.Unlock() + + return func(next http.Handler) http.Handler { + return http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { + cw := &countingResponseWriter{ResponseWriter: rw, status: http.StatusOK} + next.ServeHTTP(cw, req) + w.record(selfIP, cw.status) + }) + } +} + +// Count returns the number of responses with the given status code +// observed at the named selfIP. +func (w *CountingInternalHandlerWrap) Count(selfIP string, status int) int64 { + w.mu.Lock() + defer w.mu.Unlock() + + byStatus, ok := w.counts[selfIP] + if !ok { + return 0 + } + + c, ok := byStatus[status] + if !ok { + return 0 + } + + return c.Load() +} + +// CountAcross returns the count summed across all known selfIPs. +func (w *CountingInternalHandlerWrap) CountAcross(status int) int64 { + w.mu.Lock() + defer w.mu.Unlock() + + var total int64 + + for _, byStatus := range w.counts { + if c, ok := byStatus[status]; ok { + total += c.Load() + } + } + + return total +} + +func (w *CountingInternalHandlerWrap) record(selfIP string, status int) { + w.mu.Lock() + + byStatus, ok := w.counts[selfIP] + if !ok { + byStatus = make(map[int]*atomic.Int64) + w.counts[selfIP] = byStatus + } + + c, ok := byStatus[status] + if !ok { + c = &atomic.Int64{} + byStatus[status] = c + } + + w.mu.Unlock() + c.Add(1) +} + +// countingResponseWriter records the first WriteHeader status; if no +// WriteHeader is ever called, http.StatusOK is recorded (matching the +// net/http default). +type countingResponseWriter struct { + http.ResponseWriter + status int + wroteHeader bool +} + +func (c *countingResponseWriter) WriteHeader(status int) { + if !c.wroteHeader { + c.status = status + c.wroteHeader = true + } + + c.ResponseWriter.WriteHeader(status) +} + +func (c *countingResponseWriter) Write(p []byte) (int, error) { + if !c.wroteHeader { + c.wroteHeader = true + } + + return c.ResponseWriter.Write(p) +} + +// Flush passes through to the embedded ResponseWriter when it +// implements http.Flusher. Without this method, wrapping a handler +// that streams via Flush() (e.g. the edge handler's per-chunk +// f.Flush()) would silently degrade to buffered responses. +func (c *countingResponseWriter) Flush() { + if fl, ok := c.ResponseWriter.(http.Flusher); ok { + fl.Flush() + } +} diff --git a/internal/orca/inttest/localstack.go b/internal/orca/inttest/localstack.go new file mode 100644 index 00000000..5abb404d --- /dev/null +++ b/internal/orca/inttest/localstack.go @@ -0,0 +1,180 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "fmt" + "testing" + + "github.com/aws/aws-sdk-go-v2/aws" + awsconfig "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/credentials" + "github.com/aws/aws-sdk-go-v2/service/s3" + s3types "github.com/aws/aws-sdk-go-v2/service/s3/types" + "github.com/testcontainers/testcontainers-go" + "github.com/testcontainers/testcontainers-go/wait" +) + +// LocalStack is a running LocalStack container with helper accessors +// for constructing AWS S3 clients pointed at it. Use NewS3Client to +// get a configured client; use NewBucket to allocate a fresh bucket +// for a single test. +type LocalStack struct { + container testcontainers.Container + endpoint string + region string +} + +// AccessKey returns the LocalStack-default access key. LocalStack does +// not validate credentials but the AWS SDK requires non-empty values. +func (ls *LocalStack) AccessKey() string { return "test" } + +// SecretKey returns the LocalStack-default secret key. +func (ls *LocalStack) SecretKey() string { return "test" } + +// Endpoint returns the http:// URL of the LocalStack edge port. +func (ls *LocalStack) Endpoint() string { return ls.endpoint } + +// Region returns the static region the harness uses with LocalStack. +func (ls *LocalStack) Region() string { return ls.region } + +// StartLocalStack launches a LocalStack container and returns a handle +// once the edge port is healthy. Caller is responsible for terminating +// the container (via container.Terminate or t.Cleanup). +func StartLocalStack(ctx context.Context) (*LocalStack, error) { + req := testcontainers.ContainerRequest{ + Image: localstackImage, + ExposedPorts: []string{"4566/tcp"}, + Env: map[string]string{ + "SERVICES": "s3", + // LocalStack 3.8 returns InvalidRequest on the SDK's + // CRC64NVME default checksum. The orca s3 driver opts out + // at the SDK config level, but seeding clients in tests + // must do the same. We set the variables both in the + // container env (for any in-container tooling) and on the + // SDK config in NewS3Client. + "S3_SKIP_SIGNATURE_VALIDATION": "1", + }, + WaitingFor: wait.ForHTTP("/_localstack/health"). + WithPort("4566/tcp"). + WithStatusCodeMatcher(func(status int) bool { return status == 200 }), + } + + c, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{ + ContainerRequest: req, + Started: true, + }) + if err != nil { + return nil, fmt.Errorf("start localstack: %w", err) + } + + host, err := c.Host(ctx) + if err != nil { + _ = c.Terminate(ctx) //nolint:errcheck // best-effort cleanup + return nil, fmt.Errorf("localstack host: %w", err) + } + + port, err := c.MappedPort(ctx, "4566/tcp") + if err != nil { + _ = c.Terminate(ctx) //nolint:errcheck // best-effort cleanup + return nil, fmt.Errorf("localstack port: %w", err) + } + + return &LocalStack{ + container: c, + endpoint: fmt.Sprintf("http://%s:%s", host, port.Port()), + region: "us-east-1", + }, nil +} + +// Terminate stops and removes the LocalStack container. +func (ls *LocalStack) Terminate(ctx context.Context) error { + return ls.container.Terminate(ctx) +} + +// NewS3Client returns an AWS S3 client with LocalStack-friendly +// settings (path-style addressing, dummy credentials, checksum quirks +// disabled). +func (ls *LocalStack) NewS3Client(ctx context.Context, t *testing.T) *s3.Client { + t.Helper() + + cfg, err := awsconfig.LoadDefaultConfig(ctx, + awsconfig.WithRegion(ls.region), + awsconfig.WithCredentialsProvider(credentials.NewStaticCredentialsProvider( + ls.AccessKey(), ls.SecretKey(), "", + )), + awsconfig.WithRequestChecksumCalculation(aws.RequestChecksumCalculationWhenRequired), + awsconfig.WithResponseChecksumValidation(aws.ResponseChecksumValidationWhenRequired), + ) + if err != nil { + t.Fatalf("aws config: %v", err) + } + + return s3.NewFromConfig(cfg, func(o *s3.Options) { + o.BaseEndpoint = aws.String(ls.endpoint) + o.UsePathStyle = true + }) +} + +// NewBucket creates a fresh bucket and registers a t.Cleanup hook to +// best-effort delete it. Returns the bucket name. +func (ls *LocalStack) NewBucket(ctx context.Context, t *testing.T, prefix string) string { + t.Helper() + + cli := ls.NewS3Client(ctx, t) + name := uniqueName(prefix) + + if _, err := cli.CreateBucket(ctx, &s3.CreateBucketInput{ + Bucket: aws.String(name), + }); err != nil { + t.Fatalf("create bucket %s: %v", name, err) + } + + t.Cleanup(func() { + emptyBucket(context.Background(), cli, name) + + _, _ = cli.DeleteBucket(context.Background(), &s3.DeleteBucketInput{ //nolint:errcheck // best-effort cleanup + Bucket: aws.String(name), + }) + }) + + return name +} + +// EnableVersioning toggles versioning on a bucket. Used by the +// versioning-gate negative test. +func (ls *LocalStack) EnableVersioning(ctx context.Context, t *testing.T, bucket string) { + t.Helper() + + cli := ls.NewS3Client(ctx, t) + if _, err := cli.PutBucketVersioning(ctx, &s3.PutBucketVersioningInput{ + Bucket: aws.String(bucket), + VersioningConfiguration: &s3types.VersioningConfiguration{ + Status: s3types.BucketVersioningStatusEnabled, + }, + }); err != nil { + t.Fatalf("enable versioning on %s: %v", bucket, err) + } +} + +// emptyBucket deletes every object in the bucket. Best-effort; errors +// are ignored. +func emptyBucket(ctx context.Context, cli *s3.Client, bucket string) { + out, err := cli.ListObjectsV2(ctx, &s3.ListObjectsV2Input{ + Bucket: aws.String(bucket), + }) + if err != nil { + return + } + + for _, obj := range out.Contents { + _, _ = cli.DeleteObject(ctx, &s3.DeleteObjectInput{ //nolint:errcheck // best-effort cleanup + Bucket: aws.String(bucket), + Key: obj.Key, + }) + } +} diff --git a/internal/orca/inttest/main_test.go b/internal/orca/inttest/main_test.go new file mode 100644 index 00000000..f793abd6 --- /dev/null +++ b/internal/orca/inttest/main_test.go @@ -0,0 +1,58 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "fmt" + "os" + "testing" + "time" +) + +// Package-level container handles shared across tests in this package. +// TestMain brings them up once and tears them down at the end. +var ( + pkgLocalStack *LocalStack + pkgAzurite *Azurite +) + +// TestMain provisions LocalStack + Azurite once per `go test` run. +// Per-test buckets / containers are allocated inside individual tests +// to avoid cross-test interference. +func TestMain(m *testing.M) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + + ls, err := StartLocalStack(ctx) + if err != nil { + fmt.Fprintf(os.Stderr, "TestMain: start localstack: %v\n", err) + os.Exit(1) + } + + pkgLocalStack = ls + + az, err := StartAzurite(ctx) + if err != nil { + fmt.Fprintf(os.Stderr, "TestMain: start azurite: %v\n", err) + + _ = ls.Terminate(ctx) //nolint:errcheck // best-effort cleanup + + os.Exit(1) + } + + pkgAzurite = az + + code := m.Run() + + termCtx, termCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer termCancel() + + _ = pkgAzurite.Terminate(termCtx) //nolint:errcheck // best-effort + _ = pkgLocalStack.Terminate(termCtx) //nolint:errcheck // best-effort + + os.Exit(code) +} diff --git a/internal/orca/inttest/origins_test.go b/internal/orca/inttest/origins_test.go new file mode 100644 index 00000000..df4012f6 --- /dev/null +++ b/internal/orca/inttest/origins_test.go @@ -0,0 +1,30 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "testing" + + "github.com/Azure/unbounded/internal/orca/origin" + "github.com/Azure/unbounded/internal/orca/origin/awss3" +) + +// localStackOrigin builds an awss3.Origin pointed at the package-level +// LocalStack with the given bucket. Used by tests that need to wrap +// the origin in a CountingOrigin decorator. +func localStackOrigin(ctx context.Context, t *testing.T, bucket string) (origin.Origin, error) { + t.Helper() + + return awss3.New(ctx, awss3.Config{ + Endpoint: pkgLocalStack.Endpoint(), + Region: pkgLocalStack.Region(), + Bucket: bucket, + AccessKey: pkgLocalStack.AccessKey(), + SecretKey: pkgLocalStack.SecretKey(), + UsePathStyle: true, + }, nil) +} diff --git a/internal/orca/inttest/originwrap.go b/internal/orca/inttest/originwrap.go new file mode 100644 index 00000000..c215d9e8 --- /dev/null +++ b/internal/orca/inttest/originwrap.go @@ -0,0 +1,67 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "io" + "sync/atomic" + + "github.com/Azure/unbounded/internal/orca/origin" +) + +// CountingOrigin is an origin.Origin decorator that counts Head and +// GetRange calls. It is used by tests that need to assert +// singleflight collapse and coordinator routing. +type CountingOrigin struct { + inner origin.Origin + + heads atomic.Int64 + getRanges atomic.Int64 + lists atomic.Int64 +} + +// NewCountingOrigin wraps inner with call counters. +func NewCountingOrigin(inner origin.Origin) *CountingOrigin { + return &CountingOrigin{inner: inner} +} + +// Heads returns the number of Head() calls observed. +func (c *CountingOrigin) Heads() int64 { return c.heads.Load() } + +// GetRanges returns the number of GetRange() calls observed. +func (c *CountingOrigin) GetRanges() int64 { return c.getRanges.Load() } + +// Lists returns the number of List() calls observed. +func (c *CountingOrigin) Lists() int64 { return c.lists.Load() } + +// Reset zeroes all counters. +func (c *CountingOrigin) Reset() { + c.heads.Store(0) + c.getRanges.Store(0) + c.lists.Store(0) +} + +// Head implements origin.Origin. +func (c *CountingOrigin) Head(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) { + c.heads.Add(1) + + return c.inner.Head(ctx, bucket, key) +} + +// GetRange implements origin.Origin. +func (c *CountingOrigin) GetRange(ctx context.Context, bucket, key, etag string, off, length int64) (io.ReadCloser, error) { + c.getRanges.Add(1) + + return c.inner.GetRange(ctx, bucket, key, etag, off, length) +} + +// List implements origin.Origin. +func (c *CountingOrigin) List(ctx context.Context, bucket, prefix, marker string, maxKeys int) (origin.ListResult, error) { + c.lists.Add(1) + + return c.inner.List(ctx, bucket, prefix, marker, maxKeys) +} diff --git a/internal/orca/inttest/peersource.go b/internal/orca/inttest/peersource.go new file mode 100644 index 00000000..c349f601 --- /dev/null +++ b/internal/orca/inttest/peersource.go @@ -0,0 +1,67 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "context" + "sync" + + "github.com/Azure/unbounded/internal/orca/cluster" +) + +// StaticPeerSource implements cluster.PeerSource with a mutable peer +// list. Each replica in the harness owns its own StaticPeerSource so +// tests can mutate one replica's view of the cluster independently +// (used by TestPeerNotCoordinatorFallback to induce membership +// disagreement). +// +// The source knows its calling replica's identity (selfIP, selfPort) +// so it can stamp Peer.Self correctly even when multiple peers share +// an IP (the case in tests where every replica is on 127.0.0.1). +type StaticPeerSource struct { + mu sync.Mutex + selfIP string + selfPort int + peers []cluster.Peer +} + +// NewStaticPeerSource returns a peer source that stamps Self=true on +// any peer whose (IP, Port) matches the constructor arguments. +func NewStaticPeerSource(selfIP string, selfPort int, peers []cluster.Peer) *StaticPeerSource { + s := &StaticPeerSource{ + selfIP: selfIP, + selfPort: selfPort, + } + s.SetPeers(peers) + + return s +} + +// SetPeers replaces the current peer list. Each peer's Self bit is +// recomputed against the source's stored (selfIP, selfPort). +func (s *StaticPeerSource) SetPeers(peers []cluster.Peer) { + out := make([]cluster.Peer, len(peers)) + for i, p := range peers { + p.Self = p.IP == s.selfIP && p.Port == s.selfPort + out[i] = p + } + + s.mu.Lock() + defer s.mu.Unlock() + + s.peers = out +} + +// Peers satisfies cluster.PeerSource. +func (s *StaticPeerSource) Peers(_ context.Context) ([]cluster.Peer, error) { + s.mu.Lock() + defer s.mu.Unlock() + + out := make([]cluster.Peer, len(s.peers)) + copy(out, s.peers) + + return out, nil +} diff --git a/internal/orca/inttest/seed.go b/internal/orca/inttest/seed.go new file mode 100644 index 00000000..c286bcdc --- /dev/null +++ b/internal/orca/inttest/seed.go @@ -0,0 +1,96 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//go:build integrationtest + +package inttest + +import ( + "bytes" + "context" + "testing" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/s3" +) + +// SeedBlob describes a single blob seeded into the origin. +type SeedBlob struct { + Key string + Data []byte +} + +// SmallBlob is one chunk's-worth (1 KiB). +func SmallBlob() SeedBlob { + return SeedBlob{Key: "sample-1k", Data: deterministicBytes(1024, 0xa1)} +} + +// MediumBlob spans two 1 MiB chunks. +func MediumBlob() SeedBlob { + return SeedBlob{Key: "sample-2chunk", Data: deterministicBytes(1024*1024+512*1024, 0xb2)} +} + +// HugeBlob spans 64 chunks at the harness's 1 MiB chunk size. With 3 +// replicas, rendezvous-hashed coordinator selection statistically +// covers every replica many times over (~21 chunks per replica), +// so any test using HugeBlob exercises the full local-fill + +// cross-replica /internal/fill matrix in a single run. +func HugeBlob() SeedBlob { + return SeedBlob{Key: "sample-64chunk", Data: deterministicBytes(64*1024*1024, 0xd4)} +} + +// AllBlobs returns the canonical seed set used across most tests. +func AllBlobs() []SeedBlob { + return []SeedBlob{SmallBlob(), MediumBlob(), HugeBlob()} +} + +// SeedS3 uploads each blob to the named bucket via the provided +// LocalStack-friendly S3 client. +func SeedS3(ctx context.Context, t *testing.T, cli *s3.Client, bucket string, blobs []SeedBlob) { + t.Helper() + + for _, b := range blobs { + if _, err := cli.PutObject(ctx, &s3.PutObjectInput{ + Bucket: aws.String(bucket), + Key: aws.String(b.Key), + Body: bytes.NewReader(b.Data), + }); err != nil { + t.Fatalf("seed %s/%s: %v", bucket, b.Key, err) + } + } +} + +// DeleteS3Object removes a blob from a LocalStack bucket. Used by +// warm-cache tests to prove that subsequent GETs are served from the +// cachestore and not refetched from the origin. +func DeleteS3Object(ctx context.Context, t *testing.T, cli *s3.Client, bucket, key string) { + t.Helper() + + if _, err := cli.DeleteObject(ctx, &s3.DeleteObjectInput{ + Bucket: aws.String(bucket), + Key: aws.String(key), + }); err != nil { + t.Fatalf("delete origin %s/%s: %v", bucket, key, err) + } +} + +// SeedAzure uploads each blob to the named container as block blobs. +func SeedAzure(ctx context.Context, t *testing.T, az *Azurite, ctr string, blobs []SeedBlob) { + t.Helper() + + for _, b := range blobs { + az.UploadBlockBlob(ctx, t, ctr, b.Key, b.Data) + } +} + +// deterministicBytes returns n bytes filled with a repeating pattern +// derived from seed. Useful for byte-exact assertions without random +// flakiness. +func deterministicBytes(n int, seed byte) []byte { + out := make([]byte, n) + for i := range out { + out[i] = seed ^ byte(i*31+17) + } + + return out +} diff --git a/internal/orca/manifests/doc.go b/internal/orca/manifests/doc.go new file mode 100644 index 00000000..a629d147 --- /dev/null +++ b/internal/orca/manifests/doc.go @@ -0,0 +1,12 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package manifests holds tests that validate the orca deployment +// manifest templates render to syntactically correct, structurally +// reasonable Kubernetes YAML. +// +// These tests catch typos, missing required fields, and template +// regressions at compile time without needing a Kind cluster. They +// complement (but do not replace) hack/orca's actual `kubectl apply` +// validation. +package manifests diff --git a/internal/orca/manifests/manifests_test.go b/internal/orca/manifests/manifests_test.go new file mode 100644 index 00000000..bbab6cab --- /dev/null +++ b/internal/orca/manifests/manifests_test.go @@ -0,0 +1,307 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package manifests + +import ( + "bytes" + "errors" + "io" + "os" + "path/filepath" + "runtime" + "sort" + "strings" + "testing" + + "gopkg.in/yaml.v3" + + "github.com/Azure/unbounded/hack/cmd/render-manifests/render" +) + +// TestProductionManifestsRender renders every *.yaml.tmpl under +// deploy/orca/ (excluding the dev/ subdirectory which contains the +// in-Kind LocalStack/Azurite manifests) with realistic inputs and +// asserts the output is structurally valid Kubernetes YAML. +func TestProductionManifestsRender(t *testing.T) { + t.Parallel() + + root := repoRoot(t) + templatesDir := filepath.Join(root, "deploy", "orca") + + renderAndValidate(t, templatesDir, productionData(), + // One file at a time: walking the dev/ subdirectory is the dev + // suite's job, so we render-then-skip it here. + skipDir("dev"), + // Required kinds that MUST appear at least once across the + // rendered manifests. + expectKindsAtLeastOnce("Namespace", "Deployment", "Service", "ConfigMap"), + ) +} + +// TestDevManifestsRender renders the LocalStack + Azurite + init-Job +// manifests used by the Kind dev harness. +func TestDevManifestsRender(t *testing.T) { + t.Parallel() + + root := repoRoot(t) + templatesDir := filepath.Join(root, "deploy", "orca", "dev") + + renderAndValidate(t, templatesDir, devData(), + expectKindsAtLeastOnce("Deployment", "Service", "Job"), + ) +} + +// productionData supplies realistic template variables for the +// production-shape templates. Templates use sprig's `default` for +// missing keys; we set values that exercise the non-default paths +// where it matters. +func productionData() map[string]string { + return map[string]string{ + "Namespace": "orca-test", + "Image": "ghcr.io/example/orca:test", + "ImagePullPolicy": "IfNotPresent", + "TargetReplicas": "3", + "OriginID": "test-origin", + "OriginDriver": "awss3", + "OriginAWSS3Endpoint": "http://localstack:4566", + "OriginAWSS3Region": "us-east-1", + "OriginAWSS3Bucket": "orca-origin", + "OriginAWSS3UsePathStyle": "true", + "CachestoreEndpoint": "http://localstack:4566", + "CachestoreBucket": "orca-cache", + "CachestoreRegion": "us-east-1", + "ClusterService": "orca-peers.orca-test.svc.cluster.local", + "ServerAuthEnabled": "false", + "InternalTLSEnabled": "false", + "AzureAccount": "", + "AzureContainer": "", + "AzureEndpoint": "", + } +} + +func devData() map[string]string { + return map[string]string{ + "Namespace": "orca-test", + "CachestoreBucket": "orca-cache", + "OriginBucket": "orca-origin", + "AzuriteContainer": "orca-test", + } +} + +// renderAndValidate renders every template under templatesDir into a +// t.TempDir, then walks the output and applies each Validator. +func renderAndValidate(t *testing.T, templatesDir string, data map[string]string, validators ...Validator) { + t.Helper() + + outputDir := t.TempDir() + + if err := render.Render(templatesDir, outputDir, data); err != nil { + t.Fatalf("render.Render: %v", err) + } + // Collect every rendered .yaml file. Skip directories filtered + // by the validators. + skipDirs := skipDirsOf(validators) + + var renderedFiles []string + + walkErr := filepath.WalkDir(outputDir, func(path string, d os.DirEntry, err error) error { + if err != nil { + return err + } + + if d.IsDir() { + rel, _ := filepath.Rel(outputDir, path) + if _, skip := skipDirs[rel]; skip { + return filepath.SkipDir + } + + return nil + } + + if strings.HasSuffix(path, ".yaml") { + renderedFiles = append(renderedFiles, path) + } + + return nil + }) + if walkErr != nil { + t.Fatalf("walk rendered output: %v", walkErr) + } + + if len(renderedFiles) == 0 { + t.Fatalf("no rendered manifests found under %s", outputDir) + } + + sort.Strings(renderedFiles) + + docs := parseRenderedDocs(t, renderedFiles) + + // Always-on basic structural validation. + for _, d := range docs { + validateBasicStructure(t, d) + } + + for _, v := range validators { + v.Validate(t, docs) + } +} + +// renderedDoc is one logical YAML document plus the source file it +// came from (multi-doc files split into multiple renderedDocs). +type renderedDoc struct { + SourcePath string + Index int + Doc map[string]any +} + +func parseRenderedDocs(t *testing.T, files []string) []renderedDoc { + t.Helper() + + var docs []renderedDoc + + for _, f := range files { + raw, err := os.ReadFile(f) + if err != nil { + t.Fatalf("read %s: %v", f, err) + } + + dec := yaml.NewDecoder(bytes.NewReader(raw)) + + for i := 0; ; i++ { + var doc map[string]any + if derr := dec.Decode(&doc); derr != nil { + if errors.Is(derr, io.EOF) { + break + } + + t.Fatalf("yaml decode %s doc %d: %v", f, i, derr) + } + + if doc == nil { + continue + } + + docs = append(docs, renderedDoc{SourcePath: f, Index: i, Doc: doc}) + } + } + + return docs +} + +func validateBasicStructure(t *testing.T, d renderedDoc) { + t.Helper() + + apiVersion, _ := d.Doc["apiVersion"].(string) + kind, _ := d.Doc["kind"].(string) + + if apiVersion == "" { + t.Errorf("%s doc %d: missing apiVersion", d.SourcePath, d.Index) + } + + if kind == "" { + t.Errorf("%s doc %d: missing kind", d.SourcePath, d.Index) + } + + meta, _ := d.Doc["metadata"].(map[string]any) + if meta == nil { + t.Errorf("%s doc %d (kind=%s): missing metadata", d.SourcePath, d.Index, kind) + return + } + + name, _ := meta["name"].(string) + if name == "" { + t.Errorf("%s doc %d (kind=%s): missing metadata.name", d.SourcePath, d.Index, kind) + } +} + +// Validator is a test-time check applied to the full set of +// rendered docs. +type Validator interface { + Validate(t *testing.T, docs []renderedDoc) + skipDir() string // empty when not a dir filter +} + +type kindsAtLeastOnce struct{ kinds []string } + +func (v kindsAtLeastOnce) Validate(t *testing.T, docs []renderedDoc) { + t.Helper() + + seen := map[string]bool{} + + for _, d := range docs { + if k, _ := d.Doc["kind"].(string); k != "" { + seen[k] = true + } + } + + for _, want := range v.kinds { + if !seen[want] { + t.Errorf("expected at least one document of kind %q, got kinds %v", want, sortedKeys(seen)) + } + } +} + +func (v kindsAtLeastOnce) skipDir() string { return "" } + +func expectKindsAtLeastOnce(kinds ...string) Validator { + return kindsAtLeastOnce{kinds: kinds} +} + +type dirSkipper struct{ name string } + +func (d dirSkipper) Validate(*testing.T, []renderedDoc) {} + +func (d dirSkipper) skipDir() string { return d.name } + +func skipDir(name string) Validator { + return dirSkipper{name: name} +} + +func skipDirsOf(vs []Validator) map[string]struct{} { + out := map[string]struct{}{} + + for _, v := range vs { + if d := v.skipDir(); d != "" { + out[d] = struct{}{} + } + } + + return out +} + +func sortedKeys(m map[string]bool) []string { + out := make([]string, 0, len(m)) + for k := range m { + out = append(out, k) + } + + sort.Strings(out) + + return out +} + +// repoRoot returns the absolute path to the repo root by walking up +// from this test file's directory until it finds a go.mod. +func repoRoot(t *testing.T) string { + t.Helper() + + _, file, _, ok := runtime.Caller(0) + if !ok { + t.Fatal("runtime.Caller(0) failed") + } + + dir := filepath.Dir(file) + for { + if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil { + return dir + } + + parent := filepath.Dir(dir) + if parent == dir { + t.Fatalf("reached filesystem root without finding go.mod (started at %s)", filepath.Dir(file)) + } + + dir = parent + } +} diff --git a/internal/orca/metadata/metadata.go b/internal/orca/metadata/metadata.go new file mode 100644 index 00000000..e122463c --- /dev/null +++ b/internal/orca/metadata/metadata.go @@ -0,0 +1,331 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package metadata is the per-replica object-metadata cache. +// +// Responsibilities: +// - bounded TTL'd cache of ObjectInfo keyed on (origin_id, bucket, +// key) +// - separate negative-TTL handling for 404 / unsupported-blob-type +// entries +// - per-replica HEAD singleflight so concurrent misses collapse to +// one Origin.Head +package metadata + +import ( + "container/list" + "context" + "encoding/binary" + "errors" + "log/slog" + "strings" + "sync" + "time" + + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/origin" +) + +// Cache is the per-replica metadata cache. +type Cache struct { + cfg config.Metadata + log *slog.Logger + + mu sync.Mutex + ll *list.List + idx map[string]*list.Element + + sf sync.Map // map[string]*sfEntry +} + +type cacheEntry struct { + key string + info origin.ObjectInfo + negative bool + negErr error + expiresAt time.Time +} + +type sfEntry struct { + once sync.Once + done chan struct{} + info origin.ObjectInfo + err error +} + +// NewCache builds a Cache from config. The log is used at debug +// level for cache hit / miss / record / invalidate trace lines and +// at warn level for unexpected backend errors caught during result +// recording. Passing nil falls back to slog.Default(). +func NewCache(cfg config.Metadata, log *slog.Logger) *Cache { + if cfg.MaxEntries <= 0 { + cfg.MaxEntries = 10_000 + } + + if cfg.TTL <= 0 { + cfg.TTL = 5 * time.Minute + } + + if cfg.NegativeTTL <= 0 { + cfg.NegativeTTL = 60 * time.Second + } + + if log == nil { + log = slog.Default() + } + + return &Cache{ + cfg: cfg, + log: log, + ll: list.New(), + idx: make(map[string]*list.Element, cfg.MaxEntries), + } +} + +// lookup returns the cached ObjectInfo if present and unexpired. +// +// Returns: +// - info, true, nil -> positive cache hit +// - {}, true, err -> negative cache hit (err is the cached error) +// - {}, false, nil -> miss; caller should LookupOrFetch +func (c *Cache) lookup(originID, bucket, key string) (origin.ObjectInfo, bool, error) { + k := mkKey(originID, bucket, key) + + c.mu.Lock() + defer c.mu.Unlock() + + el, ok := c.idx[k] + if !ok { + return origin.ObjectInfo{}, false, nil + } + + // The list is private; we control every value inserted (always + // *cacheEntry). The type assertion is safe. + e := el.Value.(*cacheEntry) //nolint:errcheck // type invariant: list elements are *cacheEntry + + if time.Now().After(e.expiresAt) { + c.ll.Remove(el) + delete(c.idx, k) + + return origin.ObjectInfo{}, false, nil + } + + c.ll.MoveToFront(el) + + if e.negative { + return origin.ObjectInfo{}, true, e.negErr + } + + return e.info, true, nil +} + +// LookupOrFetch returns the cached ObjectInfo on hit (positive or +// negative); on miss, runs the per-replica HEAD singleflight against +// fetch and caches the result with the appropriate TTL. +// +// Singleflight tradeoff: the first caller (leader) drives fetch with +// its own ctx. If the leader's ctx is cancelled mid-fetch, joiners +// observe the leader's resulting ctx-error rather than their own +// (still-valid) ctx. This is the standard singleflight contract; a +// joiner can re-issue after seeing ctx.Err on a closed sfe.done if +// it wants to drive its own attempt. +func (c *Cache) LookupOrFetch( + ctx context.Context, + originID, bucket, key string, + fetch func(ctx context.Context) (origin.ObjectInfo, error), +) (origin.ObjectInfo, error) { + if info, ok, err := c.lookup(originID, bucket, key); ok { + hitKind := "positive" + if err != nil { + hitKind = "negative" + } + + c.log.LogAttrs(ctx, slog.LevelDebug, "metadata_hit", + slog.String("origin_id", originID), + slog.String("bucket", bucket), + slog.String("key", key), + slog.String("kind", hitKind), + ) + + return info, err + } + + k := mkKey(originID, bucket, key) + v, _ := c.sf.LoadOrStore(k, &sfEntry{done: make(chan struct{})}) + + // The sync.Map only ever holds *sfEntry; the type assertion is safe. + sfe := v.(*sfEntry) //nolint:errcheck // type invariant: sf map values are *sfEntry + + first := false + + sfe.once.Do(func() { + first = true + }) + + if first { + c.log.LogAttrs(ctx, slog.LevelDebug, "metadata_singleflight_leader", + slog.String("origin_id", originID), + slog.String("bucket", bucket), + slog.String("key", key), + ) + // Delete the singleflight entry before closing done so a new + // caller arriving after Delete creates a fresh entry instead + // of silently replaying our (possibly transient-error) result. + // Existing joiners already loaded the old pointer and read the + // result via the closed done. The brief window between Delete + // and close where a new caller starts a concurrent fetch is + // benign: the new fetch either confirms or supersedes our + // result. + defer func() { + c.sf.Delete(k) + close(sfe.done) + }() + + info, err := fetch(ctx) + sfe.info = info + sfe.err = err + + c.recordResult(ctx, originID, bucket, key, info, err) + + return info, err + } + + c.log.LogAttrs(ctx, slog.LevelDebug, "metadata_singleflight_join", + slog.String("origin_id", originID), + slog.String("bucket", bucket), + slog.String("key", key), + ) + // Joiner: wait for the leader. + select { + case <-ctx.Done(): + return origin.ObjectInfo{}, ctx.Err() + case <-sfe.done: + } + + return sfe.info, sfe.err +} + +// Invalidate drops the entry. +func (c *Cache) Invalidate(originID, bucket, key string) { + k := mkKey(originID, bucket, key) + + c.mu.Lock() + defer c.mu.Unlock() + + if el, ok := c.idx[k]; ok { + c.ll.Remove(el) + delete(c.idx, k) + c.log.LogAttrs(context.Background(), slog.LevelDebug, "metadata_invalidate", + slog.String("origin_id", originID), + slog.String("bucket", bucket), + slog.String("key", key), + ) + } +} + +func (c *Cache) recordResult(ctx context.Context, originID, bucket, key string, info origin.ObjectInfo, err error) { + k := mkKey(originID, bucket, key) + + c.mu.Lock() + defer c.mu.Unlock() + + now := time.Now() + + var ( + e *cacheEntry + recorded string + ttl time.Duration + ) + + switch { + case err == nil: + e = &cacheEntry{key: k, info: info, expiresAt: now.Add(c.cfg.TTL)} + recorded = "positive" + ttl = c.cfg.TTL + case errors.Is(err, origin.ErrNotFound): + e = &cacheEntry{key: k, negative: true, negErr: err, expiresAt: now.Add(c.cfg.NegativeTTL)} + recorded = "not_found" + ttl = c.cfg.NegativeTTL + default: + var ( + ube *origin.UnsupportedBlobTypeError + mte *origin.MissingETagError + ) + + switch { + case errors.As(err, &ube): + e = &cacheEntry{key: k, negative: true, negErr: err, expiresAt: now.Add(c.cfg.NegativeTTL)} + recorded = "unsupported_blob_type" + ttl = c.cfg.NegativeTTL + case errors.As(err, &mte): + e = &cacheEntry{key: k, negative: true, negErr: err, expiresAt: now.Add(c.cfg.NegativeTTL)} + recorded = "missing_etag" + ttl = c.cfg.NegativeTTL + default: + c.log.LogAttrs(ctx, slog.LevelDebug, "metadata_record_skip_transient", + slog.String("origin_id", originID), + slog.String("bucket", bucket), + slog.String("key", key), + slog.Any("err", err), + ) + // Other transient errors not cached. + return + } + } + + if existing, ok := c.idx[k]; ok { + c.ll.Remove(existing) + delete(c.idx, k) + } + + el := c.ll.PushFront(e) + + c.idx[k] = el + for c.ll.Len() > c.cfg.MaxEntries { + oldest := c.ll.Back() + if oldest == nil { + break + } + + c.ll.Remove(oldest) + + oldEntry := oldest.Value.(*cacheEntry) //nolint:errcheck // type invariant: list elements are *cacheEntry + delete(c.idx, oldEntry.key) + } + + c.log.LogAttrs(ctx, slog.LevelDebug, "metadata_record", + slog.String("origin_id", originID), + slog.String("bucket", bucket), + slog.String("key", key), + slog.String("kind", recorded), + slog.Duration("ttl", ttl), + ) +} + +// mkKey builds an in-memory cache key from (originID, bucket, key). +// The encoding is length-prefixed: each field is written as an +// 8-byte little-endian length followed by the field bytes. This +// guarantees that two distinct triples cannot collide on the +// rendered key. A naive 'origin|bucket|key' concatenation would +// alias e.g. (origin="a|b", bucket="c", key="d") and +// (origin="a", bucket="b|c", key="d") because S3 object keys may +// legally contain '|'. The cache is purely in-memory so this +// encoding has no on-disk compatibility implications. +func mkKey(originID, bucket, key string) string { + var b strings.Builder + + b.Grow(24 + len(originID) + len(bucket) + len(key)) + writeLP(&b, originID) + writeLP(&b, bucket) + writeLP(&b, key) + + return b.String() +} + +func writeLP(b *strings.Builder, s string) { + var lenBuf [8]byte + + binary.LittleEndian.PutUint64(lenBuf[:], uint64(len(s))) + b.Write(lenBuf[:]) + b.WriteString(s) +} diff --git a/internal/orca/metadata/metadata_test.go b/internal/orca/metadata/metadata_test.go new file mode 100644 index 00000000..81b25283 --- /dev/null +++ b/internal/orca/metadata/metadata_test.go @@ -0,0 +1,261 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package metadata + +import ( + "bytes" + "context" + "errors" + "io" + "log/slog" + "strings" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/origin" +) + +// TestLookupOrFetch_TransientErrorNotReplayed verifies that after the +// leader of a singleflight fetch returns a transient (non-cached) +// error, a subsequent call to LookupOrFetch invokes fetch again +// rather than silently replaying the cached error. +// +// Regression test for the defer-order race: with `close(done)` before +// `Delete`, a second caller arriving in the gap would land on the +// stale singleflight entry and skip fetch entirely. +func TestLookupOrFetch_TransientErrorNotReplayed(t *testing.T) { + t.Parallel() + + c := NewCache(config.Metadata{TTL: time.Minute, NegativeTTL: time.Minute, MaxEntries: 16}, nil) + + var calls atomic.Int64 + + transientErr := errors.New("transient: try again") + + fetch := func(_ context.Context) (origin.ObjectInfo, error) { + calls.Add(1) + return origin.ObjectInfo{}, transientErr + } + + // Sequential calls: each must invoke fetch, never replay. + for i := 0; i < 5; i++ { + _, err := c.LookupOrFetch(t.Context(), "origin", "bucket", "key", fetch) + if !errors.Is(err, transientErr) { + t.Fatalf("call %d: err=%v want %v", i, err, transientErr) + } + } + + if got := calls.Load(); got != 5 { + t.Errorf("fetch invoked %d times, want 5 (transient errors must not be cached)", got) + } +} + +// TestLookupOrFetch_PositiveResultCached verifies positive results +// are served from the cache without re-invoking fetch. +func TestLookupOrFetch_PositiveResultCached(t *testing.T) { + t.Parallel() + + c := NewCache(config.Metadata{TTL: time.Minute, NegativeTTL: time.Minute, MaxEntries: 16}, nil) + + var calls atomic.Int64 + + want := origin.ObjectInfo{Size: 1234, ETag: "abc"} + + fetch := func(_ context.Context) (origin.ObjectInfo, error) { + calls.Add(1) + return want, nil + } + + for i := 0; i < 5; i++ { + got, err := c.LookupOrFetch(t.Context(), "origin", "bucket", "key", fetch) + if err != nil { + t.Fatalf("call %d: err=%v", i, err) + } + + if got != want { + t.Errorf("call %d: got %+v want %+v", i, got, want) + } + } + + if got := calls.Load(); got != 1 { + t.Errorf("fetch invoked %d times, want 1 (positive results must be cached)", got) + } +} + +// TestLookupOrFetch_NotFoundCached verifies origin.ErrNotFound is +// negatively cached and replayed without re-invoking fetch. +func TestLookupOrFetch_NotFoundCached(t *testing.T) { + t.Parallel() + + c := NewCache(config.Metadata{TTL: time.Minute, NegativeTTL: time.Minute, MaxEntries: 16}, nil) + + var calls atomic.Int64 + + fetch := func(_ context.Context) (origin.ObjectInfo, error) { + calls.Add(1) + return origin.ObjectInfo{}, origin.ErrNotFound + } + + for i := 0; i < 3; i++ { + _, err := c.LookupOrFetch(t.Context(), "origin", "bucket", "key", fetch) + if !errors.Is(err, origin.ErrNotFound) { + t.Fatalf("call %d: err=%v want ErrNotFound", i, err) + } + } + + if got := calls.Load(); got != 1 { + t.Errorf("fetch invoked %d times, want 1 (ErrNotFound must be negatively cached)", got) + } +} + +// TestLookupOrFetch_ConcurrentJoinersCollapse verifies that +// simultaneous callers for the same key collapse to a single fetch. +func TestLookupOrFetch_ConcurrentJoinersCollapse(t *testing.T) { + t.Parallel() + + c := NewCache(config.Metadata{TTL: time.Minute, NegativeTTL: time.Minute, MaxEntries: 16}, nil) + + var calls atomic.Int64 + + gate := make(chan struct{}) + want := origin.ObjectInfo{Size: 42} + + fetch := func(_ context.Context) (origin.ObjectInfo, error) { + calls.Add(1) + <-gate // pin the leader until joiners have arrived + + return want, nil + } + + const n = 8 + + var ( + wg sync.WaitGroup + results = make([]origin.ObjectInfo, n) + errs = make([]error, n) + ) + + wg.Add(n) + + for i := 0; i < n; i++ { + go func(i int) { + defer wg.Done() + + results[i], errs[i] = c.LookupOrFetch(t.Context(), "origin", "bucket", "key", fetch) + }(i) + } + + time.Sleep(50 * time.Millisecond) // let everyone arrive at the singleflight + close(gate) + wg.Wait() + + if got := calls.Load(); got != 1 { + t.Errorf("fetch invoked %d times, want 1 (joiners must collapse)", got) + } + + for i, err := range errs { + if err != nil { + t.Errorf("call %d: err=%v", i, err) + } + + if results[i] != want { + t.Errorf("call %d: got %+v want %+v", i, results[i], want) + } + } +} + +// TestMkKey_PipeCollisionResolved verifies that length-prefixed +// encoding distinguishes (origin, bucket, key) triples that +// previously aliased on the pipe-delimited concatenation. +// +// Under the old 'origin|bucket|key' shape, S3 object keys legally +// containing '|' could produce key collisions across distinct +// triples: ("a|b","c","d") and ("a","b|c","d") rendered to the +// same string. The length-prefix encoding guarantees uniqueness. +func TestMkKey_PipeCollisionResolved(t *testing.T) { + t.Parallel() + + a := mkKey("a|b", "c", "d") + b := mkKey("a", "b|c", "d") + + if a == b { + t.Errorf("pipe-delimited collision: mkKey(%q,%q,%q) == mkKey(%q,%q,%q) = %q", + "a|b", "c", "d", "a", "b|c", "d", a) + } +} + +// TestNewCache_UsesInjectedLogger locks the contract that the +// metadata cache uses the caller's logger rather than slog.Default. +func TestNewCache_UsesInjectedLogger(t *testing.T) { + t.Parallel() + + injected := slog.New(slog.NewTextHandler(io.Discard, nil)) + c := NewCache(config.Metadata{TTL: time.Minute, NegativeTTL: time.Minute, MaxEntries: 16}, injected) + + if c.log != injected { + t.Errorf("metadata.Cache.log not the injected logger") + } +} + +// TestNewCache_NilLoggerFallsBackToDefault verifies the nil-logger +// fallback so a misconfigured caller does not panic on the first +// trace emission. +func TestNewCache_NilLoggerFallsBackToDefault(t *testing.T) { + t.Parallel() + + c := NewCache(config.Metadata{TTL: time.Minute, NegativeTTL: time.Minute, MaxEntries: 16}, nil) + if c.log == nil { + t.Errorf("nil logger should have fallen back to slog.Default()") + } +} + +// TestLookupOrFetch_EmitsDebugTraces verifies that the metadata +// cache emits the documented debug-level emissions on the leader, +// joiner, hit, and record-result paths. The contract under test is +// the named messages and the (origin_id, bucket, key) attribute +// triple - operators rely on these for diagnosing cache-hit +// patterns. +func TestLookupOrFetch_EmitsDebugTraces(t *testing.T) { + t.Parallel() + + var buf bytes.Buffer + + log := slog.New(slog.NewTextHandler(&buf, &slog.HandlerOptions{Level: slog.LevelDebug})) + c := NewCache(config.Metadata{TTL: time.Minute, NegativeTTL: time.Minute, MaxEntries: 16}, log) + + want := origin.ObjectInfo{Size: 42, ETag: "etag"} + // First call: leader path + positive record. + info, err := c.LookupOrFetch(context.Background(), "ox", "bkt", "obj", + func(_ context.Context) (origin.ObjectInfo, error) { + return want, nil + }) + if err != nil || info.Size != 42 { + t.Fatalf("LookupOrFetch leader: info=%+v err=%v", info, err) + } + // Second call: cache hit path. The fetch function must not run. + _, err = c.LookupOrFetch(context.Background(), "ox", "bkt", "obj", + func(_ context.Context) (origin.ObjectInfo, error) { + t.Fatalf("fetch should not run on cache hit") + return origin.ObjectInfo{}, nil + }) + if err != nil { + t.Fatalf("LookupOrFetch hit: %v", err) + } + + out := buf.String() + for _, want := range []string{ + "metadata_singleflight_leader", + "metadata_record", + "metadata_hit", + "bucket=bkt", + "key=obj", + } { + if !strings.Contains(out, want) { + t.Errorf("expected %q in debug output; got %q", want, out) + } + } +} diff --git a/internal/orca/origin/awss3/awss3.go b/internal/orca/origin/awss3/awss3.go new file mode 100644 index 00000000..d803ced4 --- /dev/null +++ b/internal/orca/origin/awss3/awss3.go @@ -0,0 +1,378 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package awss3 is the AWS S3 (and S3-compatible) origin driver. It +// targets either real AWS S3 or a local S3-compatible endpoint such as +// LocalStack. Useful as a credential-free origin for the dev harness: +// LocalStack acts as both origin and cachestore (different buckets). +// +// This driver is read-only from Orca's perspective (Head, GetRange, +// List). The seed step that uploads test objects to the origin bucket +// happens out-of-band via aws-cli or similar. +package awss3 + +import ( + "context" + "errors" + "fmt" + "io" + "log/slog" + "net/http" + "strings" + + "github.com/aws/aws-sdk-go-v2/aws" + awshttp "github.com/aws/aws-sdk-go-v2/aws/transport/http" + awsconfig "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/credentials" + "github.com/aws/aws-sdk-go-v2/service/s3" + s3types "github.com/aws/aws-sdk-go-v2/service/s3/types" + "github.com/aws/smithy-go" + + "github.com/Azure/unbounded/internal/orca/origin" +) + +// Adapter implements origin.Origin against an S3-compatible endpoint. +type Adapter struct { + cfg Config + client *s3.Client + log *slog.Logger +} + +// Config is the awss3-driver configuration. Mirrors config.AWSS3 but +// kept package-local so the driver can be unit-tested without +// importing the whole config package. +type Config struct { + // Endpoint, when set, overrides the regional default and routes + // requests at a custom URL (LocalStack uses + // http://localstack:4566). Leave empty for real AWS S3. + Endpoint string + + // Region is the AWS region. LocalStack ignores this; the SDK + // requires a value. + Region string + + // Bucket is the source bucket holding origin objects. + Bucket string + + // AccessKey / SecretKey are static credentials. For LocalStack + // these are "test"/"test"; for real AWS, supply real creds. + AccessKey string + SecretKey string + + // UsePathStyle: true for LocalStack (host-based addressing + // requires DNS wildcards LocalStack does not provide). + UsePathStyle bool +} + +// New constructs an Adapter. The log receives debug-level +// emissions for every Head / GetRange / List call and the error +// mapping decision (not-found / auth / precondition) on failure +// paths. Passing nil falls back to slog.Default(). +func New(ctx context.Context, cfg Config, log *slog.Logger) (*Adapter, error) { + if cfg.Bucket == "" { + return nil, fmt.Errorf("origin/awss3: bucket required") + } + + if cfg.Region == "" { + cfg.Region = "us-east-1" + } + + awsCfg, err := awsconfig.LoadDefaultConfig(ctx, + awsconfig.WithRegion(cfg.Region), + awsconfig.WithCredentialsProvider(credentials.NewStaticCredentialsProvider( + cfg.AccessKey, cfg.SecretKey, "", + )), + // Opt out of CRC64NVME default introduced in aws-sdk-go-v2 + // 1.32. LocalStack 3.8 returns InvalidRequest for unknown + // algorithms; real AWS S3 still works either way. + awsconfig.WithRequestChecksumCalculation(aws.RequestChecksumCalculationWhenRequired), + awsconfig.WithResponseChecksumValidation(aws.ResponseChecksumValidationWhenRequired), + ) + if err != nil { + return nil, fmt.Errorf("origin/awss3: aws config: %w", err) + } + + client := s3.NewFromConfig(awsCfg, func(o *s3.Options) { + if cfg.Endpoint != "" { + o.BaseEndpoint = aws.String(cfg.Endpoint) + } + + o.UsePathStyle = cfg.UsePathStyle + }) + + if log == nil { + log = slog.Default() + } + + return &Adapter{cfg: cfg, client: client, log: log}, nil +} + +// Head returns ObjectInfo for the named object. The bucket arg lets +// callers override the configured bucket; if empty, the configured +// bucket is used. +func (a *Adapter) Head(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) { + b := bucket + if b == "" { + b = a.cfg.Bucket + } + + a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_head_request", + slog.String("bucket", b), + slog.String("key", key), + ) + + out, err := a.client.HeadObject(ctx, &s3.HeadObjectInput{ + Bucket: aws.String(b), + Key: aws.String(key), + }) + if err != nil { + if isNotFound(err) { + a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_head_not_found", + slog.String("bucket", b), + slog.String("key", key), + ) + + return origin.ObjectInfo{LastStatus: http.StatusNotFound}, origin.ErrNotFound + } + + if isAuth(err) { + a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_head_auth", + slog.String("bucket", b), + slog.String("key", key), + ) + + return origin.ObjectInfo{}, origin.ErrAuth + } + + return origin.ObjectInfo{}, fmt.Errorf("awss3 head: %w", err) + } + + info := origin.ObjectInfo{LastStatus: http.StatusOK} + if out.ContentLength != nil { + info.Size = *out.ContentLength + } + + if out.ETag != nil { + info.ETag = strings.Trim(*out.ETag, "\"") + } + + if out.ContentType != nil { + info.ContentType = *out.ContentType + } + + if out.LastModified != nil { + info.LastValidated = *out.LastModified + } + + a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_head_response", + slog.String("bucket", b), + slog.String("key", key), + slog.Int64("size", info.Size), + slog.String("etag", origin.ETagShort(info.ETag)), + ) + + return info, nil +} + +// GetRange fetches [off, off+n) of the object, sending If-Match: . +func (a *Adapter) GetRange(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error) { + b := bucket + if b == "" { + b = a.cfg.Bucket + } + + rng := fmt.Sprintf("bytes=%d-%d", off, off+n-1) + + in := &s3.GetObjectInput{ + Bucket: aws.String(b), + Key: aws.String(key), + Range: aws.String(rng), + } + if etag != "" { + // S3 expects the etag wrapped in double quotes. + in.IfMatch = aws.String("\"" + etag + "\"") + } + + a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_get_range_request", + slog.String("bucket", b), + slog.String("key", key), + slog.String("etag", origin.ETagShort(etag)), + slog.Int64("off", off), + slog.Int64("n", n), + ) + + out, err := a.client.GetObject(ctx, in) + if err != nil { + if isPreconditionFailed(err) { + a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_get_range_etag_changed", + slog.String("bucket", b), + slog.String("key", key), + slog.String("want_etag", origin.ETagShort(etag)), + ) + + return nil, &origin.OriginETagChangedError{ + Bucket: b, Key: key, Want: etag, + } + } + + if isNotFound(err) { + a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_get_range_not_found", + slog.String("bucket", b), + slog.String("key", key), + ) + + return nil, origin.ErrNotFound + } + + if isAuth(err) { + a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_get_range_auth", + slog.String("bucket", b), + slog.String("key", key), + ) + + return nil, origin.ErrAuth + } + + return nil, fmt.Errorf("awss3 get-range: %w", err) + } + + a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_get_range_response", + slog.String("bucket", b), + slog.String("key", key), + ) + + return out.Body, nil +} + +// List enumerates objects under prefix. +func (a *Adapter) List(ctx context.Context, bucket, prefix, marker string, maxResults int) (origin.ListResult, error) { + b := bucket + if b == "" { + b = a.cfg.Bucket + } + + a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_list_request", + slog.String("bucket", b), + slog.String("prefix", prefix), + slog.String("marker", marker), + slog.Int("max", maxResults), + ) + + in := &s3.ListObjectsV2Input{ + Bucket: aws.String(b), + Prefix: aws.String(prefix), + MaxKeys: aws.Int32(int32(maxResults)), + } + if marker != "" { + in.ContinuationToken = aws.String(marker) + } + + out, err := a.client.ListObjectsV2(ctx, in) + if err != nil { + if isAuth(err) { + a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_list_auth", + slog.String("bucket", b), + ) + + return origin.ListResult{}, origin.ErrAuth + } + + return origin.ListResult{}, fmt.Errorf("awss3 list: %w", err) + } + + res := origin.ListResult{} + + for _, item := range out.Contents { + entry := origin.ObjectEntry{} + if item.Key != nil { + entry.Key = *item.Key + } + + if item.Size != nil { + entry.Size = *item.Size + } + + if item.ETag != nil { + entry.ETag = strings.Trim(*item.ETag, "\"") + } + + res.Entries = append(res.Entries, entry) + } + + if out.IsTruncated != nil { + res.IsTruncated = *out.IsTruncated + } + + if out.NextContinuationToken != nil { + res.NextMarker = *out.NextContinuationToken + } + + a.log.LogAttrs(ctx, slog.LevelDebug, "awss3_list_response", + slog.String("bucket", b), + slog.Int("count", len(res.Entries)), + slog.Bool("truncated", res.IsTruncated), + ) + + return res, nil +} + +func isNotFound(err error) bool { + var nsk *s3types.NoSuchKey + if errors.As(err, &nsk) { + return true + } + + var nsb *s3types.NoSuchBucket + if errors.As(err, &nsb) { + return true + } + + var notFound *s3types.NotFound + if errors.As(err, ¬Found) { + return true + } + + var respErr *awshttp.ResponseError + if errors.As(err, &respErr) && respErr.Response != nil && + respErr.Response.StatusCode == http.StatusNotFound { + return true + } + + return false +} + +func isAuth(err error) bool { + var apiErr smithy.APIError + if errors.As(err, &apiErr) { + switch apiErr.ErrorCode() { + case "AccessDenied", "Unauthorized", "Forbidden", "InvalidAccessKeyId", "SignatureDoesNotMatch": + return true + } + } + + var respErr *awshttp.ResponseError + if errors.As(err, &respErr) && respErr.Response != nil { + status := respErr.Response.StatusCode + if status == http.StatusUnauthorized || status == http.StatusForbidden { + return true + } + } + + return false +} + +// isPreconditionFailed reports whether err carries an HTTP 412 +// Precondition Failed response. Used to translate +// If-Match-rejected GetRange calls into the orca-internal +// OriginETagChangedError. We rely on the HTTP status code on the +// underlying *awshttp.ResponseError rather than service error +// codes; the status code is part of the stable wire contract +// across SDK and backend versions. +func isPreconditionFailed(err error) bool { + var respErr *awshttp.ResponseError + if errors.As(err, &respErr) && respErr.Response != nil { + return respErr.Response.StatusCode == http.StatusPreconditionFailed + } + + return false +} diff --git a/internal/orca/origin/awss3/awss3_test.go b/internal/orca/origin/awss3/awss3_test.go new file mode 100644 index 00000000..ac8fd11f --- /dev/null +++ b/internal/orca/origin/awss3/awss3_test.go @@ -0,0 +1,125 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package awss3 + +import ( + "errors" + "net/http" + "testing" + + awshttp "github.com/aws/aws-sdk-go-v2/aws/transport/http" + s3types "github.com/aws/aws-sdk-go-v2/service/s3/types" + smithy "github.com/aws/smithy-go" + smithyhttp "github.com/aws/smithy-go/transport/http" +) + +// makeResponseErr builds an *awshttp.ResponseError wrapping the +// given HTTP status code. Mirrors how the AWS SDK surfaces service +// errors to callers. +func makeResponseErr(status int, inner error) *awshttp.ResponseError { + return &awshttp.ResponseError{ + ResponseError: &smithyhttp.ResponseError{ + Response: &smithyhttp.Response{ + Response: &http.Response{StatusCode: status}, + }, + Err: inner, + }, + } +} + +// fakeAPIError implements smithy.APIError for testing service-code +// matching paths (AccessDenied / typed-not-found etc). +type fakeAPIError struct{ code string } + +func (e *fakeAPIError) Error() string { return e.code } +func (e *fakeAPIError) ErrorCode() string { return e.code } +func (e *fakeAPIError) ErrorMessage() string { return e.code } +func (e *fakeAPIError) ErrorFault() smithy.ErrorFault { return smithy.FaultUnknown } +func (e *fakeAPIError) HTTPStatusCode() int { return 0 } + +// TestIsPreconditionFailed_FromHTTPStatus verifies that only an HTTP +// 412 response satisfies the predicate. The previous implementation +// matched service codes 'PreconditionFailed' and +// 'ConditionalRequestConflict' plus a substring fallback on +// err.Error(), which was both incomplete (didn't cover backends +// returning only the status) and fragile (false positives on +// arbitrary error messages containing '412'). +func TestIsPreconditionFailed_FromHTTPStatus(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + err error + want bool + }{ + {"412 ResponseError -> true", makeResponseErr(412, errors.New("precondition")), true}, + {"500 ResponseError -> false", makeResponseErr(500, errors.New("ise")), false}, + {"404 ResponseError -> false", makeResponseErr(404, errors.New("not found")), false}, + {"plain error -> false", errors.New("StatusCode: 412 something"), false}, + {"nil -> false", nil, false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := isPreconditionFailed(tt.err); got != tt.want { + t.Errorf("isPreconditionFailed = %v, want %v", got, tt.want) + } + }) + } +} + +// TestIsNotFound covers the typed-error and HTTP-status branches. +func TestIsNotFound(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + err error + want bool + }{ + {"NoSuchKey typed", &s3types.NoSuchKey{}, true}, + {"NoSuchBucket typed", &s3types.NoSuchBucket{}, true}, + {"NotFound typed", &s3types.NotFound{}, true}, + {"404 ResponseError", makeResponseErr(404, errors.New("nf")), true}, + {"500 ResponseError", makeResponseErr(500, errors.New("ise")), false}, + {"plain error", errors.New("random"), false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := isNotFound(tt.err); got != tt.want { + t.Errorf("isNotFound = %v, want %v", got, tt.want) + } + }) + } +} + +// TestIsAuth covers both the typed APIError branch and the HTTP +// 401/403 status branch. +func TestIsAuth(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + err error + want bool + }{ + {"AccessDenied APIError", &fakeAPIError{code: "AccessDenied"}, true}, + {"InvalidAccessKeyId APIError", &fakeAPIError{code: "InvalidAccessKeyId"}, true}, + {"SignatureDoesNotMatch APIError", &fakeAPIError{code: "SignatureDoesNotMatch"}, true}, + {"403 ResponseError", makeResponseErr(403, errors.New("denied")), true}, + {"401 ResponseError", makeResponseErr(401, errors.New("unauth")), true}, + {"404 ResponseError", makeResponseErr(404, errors.New("nf")), false}, + {"500 ResponseError", makeResponseErr(500, errors.New("ise")), false}, + {"plain error", errors.New("auth?"), false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := isAuth(tt.err); got != tt.want { + t.Errorf("isAuth = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/internal/orca/origin/azureblob/azureblob.go b/internal/orca/origin/azureblob/azureblob.go new file mode 100644 index 00000000..89406ed9 --- /dev/null +++ b/internal/orca/origin/azureblob/azureblob.go @@ -0,0 +1,369 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package azureblob is the Azure Blob Storage adapter for the Origin +// interface. Block Blobs only; PageBlob and AppendBlob are rejected +// at Head() with UnsupportedBlobTypeError. +package azureblob + +import ( + "context" + "errors" + "fmt" + "io" + "log/slog" + "net/http" + "strings" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blob" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/bloberror" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/container" + + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/origin" +) + +// Adapter implements origin.Origin against Azure Blob Storage. +type Adapter struct { + cfg config.Azureblob + client *azblob.Client + log *slog.Logger +} + +// New builds an Adapter from config. The log receives debug-level +// emissions for every Head / GetRange / List call and the error +// mapping decision (not-found / auth / precondition / unsupported +// blob type) on failure paths. Passing nil falls back to +// slog.Default(). +func New(cfg config.Azureblob, log *slog.Logger) (*Adapter, error) { + if cfg.Account == "" { + return nil, fmt.Errorf("azureblob: account required") + } + + if cfg.AccountKey == "" { + return nil, fmt.Errorf("azureblob: account_key required") + } + + cred, err := azblob.NewSharedKeyCredential(cfg.Account, cfg.AccountKey) + if err != nil { + return nil, fmt.Errorf("azureblob: shared-key credential: %w", err) + } + + endpoint := cfg.Endpoint + if endpoint == "" { + endpoint = fmt.Sprintf("https://%s.blob.core.windows.net/", cfg.Account) + } + + client, err := azblob.NewClientWithSharedKeyCredential(endpoint, cred, nil) + if err != nil { + return nil, fmt.Errorf("azureblob: client: %w", err) + } + + if log == nil { + log = slog.Default() + } + + return &Adapter{cfg: cfg, client: client, log: log}, nil +} + +// Head returns ObjectInfo for the named blob. +// +// "bucket" maps to the configured container; the bucket arg is honored +// only if non-empty (allowing single-container deployments to use the +// configured container as the default). +func (a *Adapter) Head(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) { + cName := bucket + if cName == "" { + cName = a.cfg.Container + } + + a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_head_request", + slog.String("container", cName), + slog.String("key", key), + ) + + props, err := a.client.ServiceClient().NewContainerClient(cName). + NewBlobClient(key).GetProperties(ctx, nil) + if err != nil { + if isNotFound(err) { + a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_head_not_found", + slog.String("container", cName), + slog.String("key", key), + ) + + return origin.ObjectInfo{LastStatus: http.StatusNotFound}, origin.ErrNotFound + } + + if isAuth(err) { + a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_head_auth", + slog.String("container", cName), + slog.String("key", key), + ) + + return origin.ObjectInfo{}, origin.ErrAuth + } + + return origin.ObjectInfo{}, fmt.Errorf("azureblob head: %w", err) + } + + if err := validateBlobType(cName, key, props.BlobType); err != nil { + a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_head_unsupported_blob_type", + slog.String("container", cName), + slog.String("key", key), + ) + + return origin.ObjectInfo{}, err + } + + info := origin.ObjectInfo{LastStatus: http.StatusOK} + if props.ContentLength != nil { + info.Size = *props.ContentLength + } + + if props.ETag != nil { + info.ETag = unwrapAzcoreETag(props.ETag) + } + + if props.ContentType != nil { + info.ContentType = *props.ContentType + } + + if props.LastModified != nil { + info.LastValidated = *props.LastModified + } + + a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_head_response", + slog.String("container", cName), + slog.String("key", key), + slog.Int64("size", info.Size), + slog.String("etag", origin.ETagShort(info.ETag)), + ) + + return info, nil +} + +// GetRange fetches [off, off+n) of the blob, sending If-Match: . +func (a *Adapter) GetRange(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error) { + cName := bucket + if cName == "" { + cName = a.cfg.Container + } + + bc := a.client.ServiceClient().NewContainerClient(cName).NewBlobClient(key) + opts := &azblob.DownloadStreamOptions{ + Range: blob.HTTPRange{Offset: off, Count: n}, + } + + if etag != "" { + // Azure (like S3) expects the entity-tag value in If-Match + // to be a quoted-string per RFC 7232. We strip the quotes + // on Head (a.cfg internal representation is unquoted) so + // re-wrap here at the point of egress, mirroring the + // awss3 driver. + etagVal := azcore.ETag("\"" + etag + "\"") + opts.AccessConditions = &blob.AccessConditions{ + ModifiedAccessConditions: &blob.ModifiedAccessConditions{ + IfMatch: to.Ptr(etagVal), + }, + } + } + + a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_get_range_request", + slog.String("container", cName), + slog.String("key", key), + slog.String("etag", origin.ETagShort(etag)), + slog.Int64("off", off), + slog.Int64("n", n), + ) + + resp, err := bc.DownloadStream(ctx, opts) + if err != nil { + if isPreconditionFailed(err) { + a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_get_range_etag_changed", + slog.String("container", cName), + slog.String("key", key), + slog.String("want_etag", origin.ETagShort(etag)), + ) + + return nil, &origin.OriginETagChangedError{ + Bucket: cName, Key: key, Want: etag, + } + } + + if isNotFound(err) { + a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_get_range_not_found", + slog.String("container", cName), + slog.String("key", key), + ) + + return nil, origin.ErrNotFound + } + + if isAuth(err) { + a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_get_range_auth", + slog.String("container", cName), + slog.String("key", key), + ) + + return nil, origin.ErrAuth + } + + return nil, fmt.Errorf("azureblob get-range: %w", err) + } + + a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_get_range_response", + slog.String("container", cName), + slog.String("key", key), + ) + + return resp.Body, nil +} + +// List enumerates blobs in the container matching prefix. +func (a *Adapter) List(ctx context.Context, bucket, prefix, marker string, maxResults int) (origin.ListResult, error) { + cName := bucket + if cName == "" { + cName = a.cfg.Container + } + + a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_list_request", + slog.String("container", cName), + slog.String("prefix", prefix), + slog.String("marker", marker), + slog.Int("max", maxResults), + ) + + cc := a.client.ServiceClient().NewContainerClient(cName) + max := int32(maxResults) + pager := cc.NewListBlobsFlatPager(&container.ListBlobsFlatOptions{ + Prefix: &prefix, + MaxResults: &max, + Marker: stringOrNil(marker), + }) + out := origin.ListResult{} + + if pager.More() { + page, err := pager.NextPage(ctx) + if err != nil { + if isAuth(err) { + a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_list_auth", + slog.String("container", cName), + ) + + return origin.ListResult{}, origin.ErrAuth + } + + return origin.ListResult{}, fmt.Errorf("azureblob list: %w", err) + } + + for _, item := range page.Segment.BlobItems { + entry := origin.ObjectEntry{} + if item.Name != nil { + entry.Key = *item.Name + } + + if item.Properties != nil { + if item.Properties.ContentLength != nil { + entry.Size = *item.Properties.ContentLength + } + + if item.Properties.ETag != nil { + entry.ETag = unwrapAzcoreETag(item.Properties.ETag) + } + + if item.Properties.BlobType != nil { + entry.BlobType = string(*item.Properties.BlobType) + } + } + + out.Entries = append(out.Entries, entry) + } + + if page.NextMarker != nil { + out.NextMarker = *page.NextMarker + out.IsTruncated = *page.NextMarker != "" + } + } + + a.log.LogAttrs(ctx, slog.LevelDebug, "azureblob_list_response", + slog.String("container", cName), + slog.Int("count", len(out.Entries)), + slog.Bool("truncated", out.IsTruncated), + ) + + return out, nil +} + +func stringOrNil(s string) *string { + if s == "" { + return nil + } + + return &s +} + +func isNotFound(err error) bool { + return bloberror.HasCode(err, bloberror.BlobNotFound) || + bloberror.HasCode(err, bloberror.ContainerNotFound) || + errors.Is(err, origin.ErrNotFound) +} + +func isAuth(err error) bool { + var rerr *azcore.ResponseError + if errors.As(err, &rerr) { + if rerr.StatusCode == http.StatusUnauthorized || rerr.StatusCode == http.StatusForbidden { + return true + } + } + + return bloberror.HasCode(err, bloberror.AuthenticationFailed) || + bloberror.HasCode(err, bloberror.AuthorizationFailure) +} + +func isPreconditionFailed(err error) bool { + var rerr *azcore.ResponseError + if errors.As(err, &rerr) && rerr.StatusCode == http.StatusPreconditionFailed { + return true + } + + return bloberror.HasCode(err, bloberror.ConditionNotMet) +} + +// validateBlobType returns an UnsupportedBlobTypeError for any +// non-Block-Blob type (Page or Append). PageBlob and AppendBlob's +// random-access-mutation model is incompatible with orca's chunked +// immutable cache contract, so they are unconditionally rejected +// here. Extracted as a pure function so unit tests can cover the +// branches without an Azurite round-trip. +func validateBlobType(container, key string, blobType *blob.BlobType) error { + if blobType == nil { + return nil + } + + if *blobType == blob.BlobTypeBlockBlob { + return nil + } + + return &origin.UnsupportedBlobTypeError{ + Bucket: container, + Key: key, + BlobType: string(*blobType), + } +} + +// unwrapAzcoreETag normalises an *azcore.ETag from the Azure SDK +// to the unquoted form orca uses internally. The Azure REST API +// returns entity tags as quoted-strings per RFC 7232; the SDK +// preserves the quotes, and orca strips them at the boundary so +// later If-Match egress (which re-wraps via the awss3 / azureblob +// drivers) doesn't double-quote. +func unwrapAzcoreETag(e *azcore.ETag) string { + if e == nil { + return "" + } + + return strings.Trim(string(*e), "\"") +} diff --git a/internal/orca/origin/azureblob/azureblob_test.go b/internal/orca/origin/azureblob/azureblob_test.go new file mode 100644 index 00000000..20e5fccf --- /dev/null +++ b/internal/orca/origin/azureblob/azureblob_test.go @@ -0,0 +1,201 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package azureblob + +import ( + "context" + "encoding/base64" + "errors" + "io" + "net/http" + "net/http/httptest" + "sync/atomic" + "testing" + + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blob" + + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/origin" +) + +// TestValidateBlobType covers every branch of the unconditional +// block-blob-only enforcement. PageBlob and AppendBlob are always +// rejected; BlockBlob and the nil/unknown response shape pass. +func TestValidateBlobType(t *testing.T) { + pageBlob := blob.BlobTypePageBlob + appendBlob := blob.BlobTypeAppendBlob + blockBlob := blob.BlobTypeBlockBlob + + tests := []struct { + name string + blobType *blob.BlobType + wantUnsupported bool + }{ + {"nil blob type passes (no info to validate)", nil, false}, + {"block blob accepted", &blockBlob, false}, + {"page blob refused", &pageBlob, true}, + {"append blob refused", &appendBlob, true}, + } + + const ( + container = "ctr" + key = "key" + ) + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := validateBlobType(container, key, tt.blobType) + + if (err != nil) != tt.wantUnsupported { + t.Fatalf("err=%v, wantUnsupported=%v", err, tt.wantUnsupported) + } + + if !tt.wantUnsupported { + return + } + + var ube *origin.UnsupportedBlobTypeError + if !errors.As(err, &ube) { + t.Fatalf("err type=%T (want *origin.UnsupportedBlobTypeError): %v", err, err) + } + + if ube.Bucket != container { + t.Errorf("Bucket=%q want %q", ube.Bucket, container) + } + + if ube.Key != key { + t.Errorf("Key=%q want %q", ube.Key, key) + } + + if tt.blobType != nil && ube.BlobType != string(*tt.blobType) { + t.Errorf("BlobType=%q want %q", ube.BlobType, string(*tt.blobType)) + } + }) + } +} + +// TestValidateBlobType_NonBlockBlob_AlwaysRejected is the regression +// test for the fix that removed the user-overridable +// EnforceBlockBlobOnly flag. There is no longer any code path that +// accepts a Page or Append blob. +func TestValidateBlobType_NonBlockBlob_AlwaysRejected(t *testing.T) { + pageBlob := blob.BlobTypePageBlob + + if err := validateBlobType("ctr", "key", &pageBlob); err == nil { + t.Fatalf("page blob accepted; want UnsupportedBlobTypeError") + } + + appendBlob := blob.BlobTypeAppendBlob + if err := validateBlobType("ctr", "key", &appendBlob); err == nil { + t.Fatalf("append blob accepted; want UnsupportedBlobTypeError") + } +} + +// TestGetRange_QuotesIfMatchHeader verifies that the If-Match header +// emitted on a conditional GetRange is the etag value wrapped in +// double quotes per RFC 7232. The internal representation strips +// quotes on Head (drivers normalise to unquoted), so this is the +// re-wrap point on egress. Without the wrap an upstream that +// strictly enforces RFC 7232 entity-tag syntax would reject the +// precondition or treat it as never-matched. +func TestGetRange_QuotesIfMatchHeader(t *testing.T) { + t.Parallel() + + const etagUnquoted = "0x8DDCAFE00000000" + + var captured atomic.Value // string + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + captured.Store(r.Header.Get("If-Match")) + // Respond with the requested bytes. The exact body is not + // validated by this test - only the inbound If-Match header + // is. A small synthetic body keeps the SDK happy. + w.Header().Set("Content-Length", "4") + w.Header().Set("Content-Type", "application/octet-stream") + w.Header().Set("ETag", "\""+etagUnquoted+"\"") + w.WriteHeader(http.StatusPartialContent) + _, _ = w.Write([]byte("test")) //nolint:errcheck // best-effort test write + })) + + t.Cleanup(srv.Close) + // Azurite uses the account name as the URL path component. We + // mirror that shape so the SDK signs/issues requests in the + // expected layout. + cfg := config.Azureblob{ + Account: "devstoreaccount1", + AccountKey: base64.StdEncoding.EncodeToString([]byte("test-shared-key-placeholder--32b")), + Container: "ctr", + Endpoint: srv.URL + "/devstoreaccount1", + } + + a, err := New(cfg, nil) + if err != nil { + t.Fatalf("azureblob.New: %v", err) + } + + body, err := a.GetRange(context.Background(), "ctr", "key", etagUnquoted, 0, 4) + if err != nil { + t.Fatalf("GetRange: %v", err) + } + + defer body.Close() //nolint:errcheck // test cleanup + + if _, err := io.ReadAll(body); err != nil { + t.Fatalf("read body: %v", err) + } + + got, _ := captured.Load().(string) + + want := "\"" + etagUnquoted + "\"" + if got != want { + t.Errorf("If-Match=%q want %q", got, want) + } +} + +// TestGetRange_OmitsIfMatchWhenEtagEmpty verifies that the If-Match +// header is not sent at all when the caller supplies an empty etag. +// Sending an empty If-Match would either be a malformed precondition +// or evaluate as never-matching depending on server interpretation. +func TestGetRange_OmitsIfMatchWhenEtagEmpty(t *testing.T) { + t.Parallel() + + var captured atomic.Value // string + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // Record presence/absence; empty string here means "header + // was absent". + captured.Store(r.Header.Get("If-Match")) + w.Header().Set("Content-Length", "4") + w.WriteHeader(http.StatusPartialContent) + _, _ = w.Write([]byte("test")) //nolint:errcheck // best-effort test write + })) + + t.Cleanup(srv.Close) + + cfg := config.Azureblob{ + Account: "devstoreaccount1", + AccountKey: base64.StdEncoding.EncodeToString([]byte("test-shared-key-placeholder--32b")), + Container: "ctr", + Endpoint: srv.URL + "/devstoreaccount1", + } + + a, err := New(cfg, nil) + if err != nil { + t.Fatalf("azureblob.New: %v", err) + } + + body, err := a.GetRange(context.Background(), "ctr", "key", "", 0, 4) + if err != nil { + t.Fatalf("GetRange: %v", err) + } + + defer body.Close() //nolint:errcheck // test cleanup + + _, _ = io.ReadAll(body) //nolint:errcheck // test cleanup + + got, _ := captured.Load().(string) + if got != "" { + t.Errorf("If-Match present (%q) when etag was empty; want absent", got) + } +} diff --git a/internal/orca/origin/origin.go b/internal/orca/origin/origin.go new file mode 100644 index 00000000..326c8884 --- /dev/null +++ b/internal/orca/origin/origin.go @@ -0,0 +1,133 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package origin defines the upstream-blob-store interface and shared +// types. Concrete adapters live under origin//. +package origin + +import ( + "context" + "errors" + "fmt" + "io" + "time" +) + +// Origin is a read-only view of an upstream blob store. +type Origin interface { + // Head returns object metadata. If the blob does not exist, returns + // ErrNotFound. If the blob is an unsupported type (e.g., azureblob + // non-BlockBlob), returns UnsupportedBlobTypeError. + Head(ctx context.Context, bucket, key string) (ObjectInfo, error) + + // GetRange fetches [off, off+n) bytes of the object. The etag is + // passed as `If-Match: ` so a mid-flight overwrite is detected + // at the wire (returns OriginETagChangedError). + GetRange(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error) + + // List enumerates objects under prefix. Pagination via marker. + List(ctx context.Context, bucket, prefix, marker string, max int) (ListResult, error) +} + +// ObjectInfo is the result of a successful Head. +type ObjectInfo struct { + Size int64 + ETag string + ContentType string + LastValidated time.Time + LastStatus int +} + +// ListResult is the paginated result of List. +type ListResult struct { + Entries []ObjectEntry + NextMarker string + IsTruncated bool +} + +// ObjectEntry is one item in a ListResult. +type ObjectEntry struct { + Key string + Size int64 + ETag string + BlobType string // "" for s3; "BlockBlob" / "PageBlob" / "AppendBlob" for azureblob +} + +// Sentinel errors. Wrap with %w so callers use errors.Is. +// +// Driver contract: +// +// - ErrNotFound: blob does not exist. AWS S3 driver returns this for +// NoSuchKey responses; the azureblob driver for BlobNotFound / +// ContainerNotFound. +// - ErrAuth: 401 / 403. AWS S3 driver returns this for AccessDenied +// and similar; the azureblob driver for HTTP 401/403 and the +// AuthenticationFailed / AuthorizationFailure codes. +// +// New drivers should map their SDK-specific not-found and auth +// indicators onto these sentinels so handlers can route consistently +// via errors.Is. +var ( + ErrNotFound = errors.New("origin: not found") + ErrAuth = errors.New("origin: auth") +) + +// OriginETagChangedError is returned by GetRange when the origin +// rejects the If-Match precondition. +type OriginETagChangedError struct { + Bucket string + Key string + Want string +} + +func (e *OriginETagChangedError) Error() string { + return fmt.Sprintf("origin etag changed for %s/%s: want=%q", + e.Bucket, e.Key, e.Want) +} + +// UnsupportedBlobTypeError is returned by azureblob.Head when the +// target is a Page or Append blob. Orca only serves Block Blobs. +type UnsupportedBlobTypeError struct { + Bucket string + Key string + BlobType string +} + +func (e *UnsupportedBlobTypeError) Error() string { + return fmt.Sprintf("origin unsupported blob type %s for %s/%s", + e.BlobType, e.Bucket, e.Key) +} + +// MissingETagError is returned by the fetch coordinator when an +// origin Head response carries an empty ETag. chunk.Path encodes the +// ETag in its hash input; a stable cache key requires the origin to +// supply one. Misconfigured backends (some S3-compatible +// implementations with specific bucket policies, custom origins not +// following the AWS/Azure contract) can omit ETags, in which case +// two different versions of the same (bucket, key) would alias to +// the same chunk.Path and orca would silently serve stale bytes. +// Rejecting at Head time surfaces the misconfiguration immediately +// instead of after observable corruption. +type MissingETagError struct { + Bucket string + Key string +} + +func (e *MissingETagError) Error() string { + return fmt.Sprintf("origin returned empty ETag for %s/%s; orca requires versioned origins", + e.Bucket, e.Key) +} + +// ETagShort returns the first 8 characters of an unquoted ETag for +// log/debug emissions. ETags are not secrets but they're long enough +// to make log lines hard to read; the prefix is sufficient for +// matching one fill against another. Returns the input unchanged +// when shorter than 8 chars. +func ETagShort(etag string) string { + const n = 8 + if len(etag) <= n { + return etag + } + + return etag[:n] +} diff --git a/internal/orca/origin/origin_test.go b/internal/orca/origin/origin_test.go new file mode 100644 index 00000000..f9f1f85d --- /dev/null +++ b/internal/orca/origin/origin_test.go @@ -0,0 +1,32 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package origin + +import "testing" + +// TestETagShort covers the truncation contract: ETags 8 characters or +// shorter pass through unchanged; longer ETags are truncated to the +// first 8 characters. The truncation is for log-line readability only; +// callers must not use the short form as a precondition value. +func TestETagShort(t *testing.T) { + t.Parallel() + + tests := []struct { + in string + want string + }{ + {"", ""}, + {"abc", "abc"}, + {"01234567", "01234567"}, + {"012345678", "01234567"}, + {"0x8DDCAFE00000000ABCDEF", "0x8DDCAF"}, + } + + for _, tt := range tests { + got := ETagShort(tt.in) + if got != tt.want { + t.Errorf("ETagShort(%q) = %q, want %q", tt.in, got, tt.want) + } + } +} diff --git a/internal/orca/server/server.go b/internal/orca/server/server.go new file mode 100644 index 00000000..1cf51d8c --- /dev/null +++ b/internal/orca/server/server.go @@ -0,0 +1,975 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package server holds the HTTP handlers for the client edge and the +// internal-listener. +// +// Client edge (8443): GET /{bucket}/{key} (with optional Range), HEAD, +// LIST. No auth in dev (server.auth.enabled=false). +// +// Internal listener (8444): GET /internal/fill?. No mTLS in +// dev (cluster.internal_tls.enabled=false). +package server + +import ( + "bufio" + "context" + "encoding/xml" + "errors" + "fmt" + "io" + "log/slog" + "net/http" + "strconv" + "strings" + + "github.com/Azure/unbounded/internal/orca/chunk" + "github.com/Azure/unbounded/internal/orca/cluster" + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/origin" +) + +// EdgeHandler implements the client-edge S3 surface. +type EdgeHandler struct { + fc edgeFetchAPI + cfg *config.Config + log *slog.Logger +} + +// edgeFetchAPI is the surface area EdgeHandler depends on. The real +// *fetch.Coordinator satisfies it; tests substitute small fakes for +// deterministic unit-level coverage. +type edgeFetchAPI interface { + HeadObject(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) + GetChunk(ctx context.Context, k chunk.Key, objectSize int64) (io.ReadCloser, error) + Origin() origin.Origin +} + +// NewEdgeHandler wires the edge handler. +func NewEdgeHandler(fc edgeFetchAPI, cfg *config.Config, log *slog.Logger) *EdgeHandler { + return &EdgeHandler{fc: fc, cfg: cfg, log: log} +} + +// ServeHTTP routes incoming client requests. +// +// Routing (path-style only, since LocalStack and most dev clients +// use path-style): +// +// GET / -> ListBuckets (not supported; 405) +// GET /{bucket}/?list-type=2&prefix=... -> ListObjectsV2 +// GET /{bucket}/ -> ListObjectsV2 (default) +// GET /{bucket}/{key} -> GetObject (with optional Range) +// HEAD /{bucket}/{key} -> HeadObject +// HEAD /{bucket}/ -> HeadBucket (not supported; 405) +func (h *EdgeHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + if h.cfg.Server.Auth.Enabled { + // Stub: production would dispatch to bearer/mTLS validation. + // In dev (auth.enabled=false) we skip entirely. + http.Error(w, "auth required (server.auth.enabled=true) but not implemented in MVP", + http.StatusUnauthorized) + + return + } + + bucket, key := splitPath(r.URL.Path) + + h.log.LogAttrs(r.Context(), slog.LevelDebug, "edge_request", + slog.String("method", r.Method), + slog.String("path", r.URL.Path), + slog.String("bucket", bucket), + slog.String("key", key), + slog.String("range", r.Header.Get("Range")), + slog.String("remote", r.RemoteAddr), + ) + + switch r.Method { + case http.MethodHead: + if key == "" { + h.notImplemented(w, "HeadBucket") + return + } + + h.handleHead(w, r, bucket, key) + case http.MethodGet: + if key == "" { + h.handleList(w, r, bucket) + return + } + + h.handleGet(w, r, bucket, key) + default: + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + } +} + +func (h *EdgeHandler) handleHead(w http.ResponseWriter, r *http.Request, bucket, key string) { + info, err := h.fc.HeadObject(r.Context(), bucket, key) + if err != nil { + h.writeOriginError(w, err) + return + } + + h.log.LogAttrs(r.Context(), slog.LevelDebug, "edge_head_response", + slog.String("bucket", bucket), + slog.String("key", key), + slog.Int64("size", info.Size), + slog.String("etag", origin.ETagShort(info.ETag)), + ) + + setObjectHeaders(w, info) + // HEAD must report the Content-Length the GET response would carry. + w.Header().Set("Content-Length", strconv.FormatInt(info.Size, 10)) + w.WriteHeader(http.StatusOK) +} + +func (h *EdgeHandler) handleGet(w http.ResponseWriter, r *http.Request, bucket, key string) { + info, err := h.fc.HeadObject(r.Context(), bucket, key) + if err != nil { + h.writeOriginError(w, err) + return + } + + // Zero-byte objects short-circuit to 200 + empty body. The normal + // flow below would compute rangeEnd = info.Size - 1 = -1 and fall + // into the rangeStart > rangeEnd guard, returning a spurious 416 + // for what should be a successful empty-body fetch. Any Range + // request against a zero-byte object is genuinely unsatisfiable + // and remains a 416 (RFC 7233). + if info.Size == 0 { + if r.Header.Get("Range") != "" { + http.Error(w, "range not satisfiable", http.StatusRequestedRangeNotSatisfiable) + return + } + + setObjectHeaders(w, info) + w.Header().Set("Content-Length", "0") + w.WriteHeader(http.StatusOK) + + h.log.LogAttrs(r.Context(), slog.LevelDebug, "edge_get_empty_object", + slog.String("bucket", bucket), + slog.String("key", key), + ) + + return + } + + // Determine byte range. + var ( + rangeStart int64 + rangeEnd = info.Size - 1 + hasRange bool + statusCode = http.StatusOK + ) + if rh := r.Header.Get("Range"); rh != "" { + s, e, ok := parseSimpleByteRange(rh, info.Size) + if !ok { + http.Error(w, "invalid Range", http.StatusRequestedRangeNotSatisfiable) + return + } + + rangeStart, rangeEnd = s, e + hasRange = true + statusCode = http.StatusPartialContent + } + + if rangeStart > rangeEnd { + http.Error(w, "range not satisfiable", http.StatusRequestedRangeNotSatisfiable) + return + } + + chunkSize := chunk.SizeFor(info.Size, h.cfg.Chunking.Size, h.cfg.Chunking.AsChunkTiers()) + firstChunk, lastChunk := chunk.IndexRange(rangeStart, rangeEnd, chunkSize, info.Size) + + h.log.LogAttrs(r.Context(), slog.LevelDebug, "edge_get_plan", + slog.String("bucket", bucket), + slog.String("key", key), + slog.Int64("range_start", rangeStart), + slog.Int64("range_end", rangeEnd), + slog.Int64("first_chunk", firstChunk), + slog.Int64("last_chunk", lastChunk), + slog.Int64("chunk_size", chunkSize), + slog.Bool("has_range", hasRange), + ) + + // Fetch the first chunk before committing any response headers + // so that origin errors (404, auth, timeout, mid-stream blob + // fault) surface as a clean S3-style error response instead of + // a half-written 200 followed by a dropped connection. Once the + // first byte is in hand we know the rest of the stream is + // "tentatively" healthy; subsequent chunk failures remain + // mid-stream aborts. + firstKey := chunk.Key{ + OriginID: h.cfg.Origin.ID, + Bucket: bucket, + ObjectKey: key, + ETag: info.ETag, + ChunkSize: chunkSize, + Index: firstChunk, + } + + firstBody, err := h.fc.GetChunk(r.Context(), firstKey, info.Size) + if err != nil { + h.writeOriginError(w, err) + return + } + // Peek a single byte to drain any first-read errors from the + // underlying body (e.g. cachestore-backed bodies can fail on the + // first network read). io.EOF on peek is acceptable for the + // degenerate empty-chunk case. + firstReader := bufio.NewReader(firstBody) + if _, err := firstReader.Peek(1); err != nil && !errors.Is(err, io.EOF) { + firstBody.Close() //nolint:errcheck // closing on error path + h.writeOriginError(w, err) + + return + } + + // Set headers eagerly. The response headers are committed below + // once the first chunk has been confirmed readable; thereafter + // any failure becomes a mid-stream abort. + setObjectHeaders(w, info) + w.Header().Set("Content-Length", strconv.FormatInt(rangeEnd-rangeStart+1, 10)) + + if hasRange { + w.Header().Set("Content-Range", + fmt.Sprintf("bytes %d-%d/%d", rangeStart, rangeEnd, info.Size)) + } + // Write status now; subsequent failures become mid-stream aborts. + w.WriteHeader(statusCode) + + // Stream the first chunk's slice. Any failure here is now a + // mid-stream abort (headers are committed). + off, length := chunk.ChunkSlice(firstChunk, chunkSize, rangeStart, rangeEnd, info.Size) + if err := streamSlice(w, firstReader, off, length); err != nil { + firstBody.Close() //nolint:errcheck // body close best-effort, response already streaming + h.log.LogAttrs(r.Context(), slog.LevelWarn, "mid-stream copy failed", + slog.String("bucket", bucket), + slog.String("key", key), + slog.Int64("chunk", firstChunk), + slog.Any("err", err), + ) + + return + } + + firstBody.Close() //nolint:errcheck // body close best-effort, response already streaming + + if f, ok := w.(http.Flusher); ok { + f.Flush() + } + + if firstChunk < lastChunk { + h.streamRemainingChunks(r.Context(), w, bucket, key, info, chunkSize, + rangeStart, rangeEnd, firstChunk+1, lastChunk) + } + + h.log.LogAttrs(r.Context(), slog.LevelDebug, "edge_get_complete", + slog.String("bucket", bucket), + slog.String("key", key), + slog.Int64("bytes", rangeEnd-rangeStart+1), + ) +} + +// streamRemainingChunks fetches and streams chunks [firstIdx, lastIdx] +// after the first chunk has already been delivered. Honors the +// configured Chunking.Readahead depth: with depth > 0 a producer +// goroutine prefetches up to depth chunks while the consumer streams +// the current one; with depth == 0 the loop is strictly sequential +// (zero-overhead opt-out preserving the pre-readahead behavior). +// +// All failures here are mid-stream aborts: response headers are +// already committed, so the only remedy is logging and returning. +func (h *EdgeHandler) streamRemainingChunks( + ctx context.Context, + w http.ResponseWriter, + bucket, key string, + info origin.ObjectInfo, + chunkSize, rangeStart, rangeEnd int64, + firstIdx, lastIdx int64, +) { + depth := h.cfg.Chunking.ReadaheadDepth() + if depth <= 0 { + h.streamRemainingChunksSequential(ctx, w, bucket, key, info, chunkSize, + rangeStart, rangeEnd, firstIdx, lastIdx) + + return + } + + h.streamRemainingChunksReadahead(ctx, w, bucket, key, info, chunkSize, + rangeStart, rangeEnd, firstIdx, lastIdx, depth) +} + +// streamRemainingChunksSequential is the pre-readahead loop body: +// fetch chunk N, stream it, close it, advance. One in-flight chunk +// fetch at a time. Used when Chunking.Readahead is 0. +func (h *EdgeHandler) streamRemainingChunksSequential( + ctx context.Context, + w http.ResponseWriter, + bucket, key string, + info origin.ObjectInfo, + chunkSize, rangeStart, rangeEnd int64, + firstIdx, lastIdx int64, +) { + for ci := firstIdx; ci <= lastIdx; ci++ { + ckey := chunk.Key{ + OriginID: h.cfg.Origin.ID, + Bucket: bucket, + ObjectKey: key, + ETag: info.ETag, + ChunkSize: chunkSize, + Index: ci, + } + + h.log.LogAttrs(ctx, slog.LevelDebug, "edge_get_chunk_next", + slog.String("bucket", bucket), + slog.String("key", key), + slog.Int64("chunk", ci), + ) + + body, err := h.fc.GetChunk(ctx, ckey, info.Size) + if err != nil { + h.log.LogAttrs(ctx, slog.LevelWarn, "mid-stream chunk fetch failed", + slog.String("bucket", bucket), + slog.String("key", key), + slog.Int64("chunk", ci), + slog.Any("err", err), + ) + + return + } + + off, length := chunk.ChunkSlice(ci, chunkSize, rangeStart, rangeEnd, info.Size) + if err := streamSlice(w, body, off, length); err != nil { + body.Close() //nolint:errcheck // chunk body close best-effort, response already streaming + h.log.LogAttrs(ctx, slog.LevelWarn, "mid-stream copy failed", + slog.String("bucket", bucket), + slog.String("key", key), + slog.Int64("chunk", ci), + slog.Any("err", err), + ) + + return + } + + body.Close() //nolint:errcheck // chunk body close best-effort, response already streaming + + if f, ok := w.(http.Flusher); ok { + f.Flush() + } + } +} + +// pendingChunk is one item produced by the readahead pipeline: an +// in-order chunk body (or the error that prevented fetching it). +// The consumer is responsible for Close()ing rc when non-nil. +type pendingChunk struct { + idx int64 + rc io.ReadCloser + err error +} + +// readaheadJob is a chunk-fetch slot held in the dispatcher's queue. +// Each job owns a 1-buffered result channel that its worker writes +// to exactly once before exiting. +type readaheadJob struct { + idx int64 + rc chan pendingChunk +} + +// streamRemainingChunksReadahead runs a producer goroutine that +// fetches chunks ahead into a bounded channel of capacity depth, +// while the main goroutine streams the current chunk to the client. +// This hides per-chunk cachestore RTT behind body transfer time so +// large-blob GETs no longer pay N strictly-serial round trips. +// +// Lifecycle: +// - Consumer aborts (mid-stream copy failure, fetch error, +// producer-channel closed early) cancel the producer's context; +// the producer drains and closes any bodies it has already +// prefetched on the way out. +// - Producer panics are recovered, logged, and surface to the +// consumer as an early channel close; the consumer treats that +// as a mid-stream abort and returns cleanly. +// - Context cancellation from the caller (client disconnect) +// propagates through prefetchCtx, cancelling in-flight +// GetChunk calls and causing the producer to exit. +func (h *EdgeHandler) streamRemainingChunksReadahead( + ctx context.Context, + w http.ResponseWriter, + bucket, key string, + info origin.ObjectInfo, + chunkSize, rangeStart, rangeEnd int64, + firstIdx, lastIdx int64, + depth int, +) { + prefetchCtx, cancelPrefetch := context.WithCancel(ctx) + defer cancelPrefetch() + + ch := h.prefetchChunks(prefetchCtx, bucket, key, info.ETag, chunkSize, info.Size, + firstIdx, lastIdx, depth) + + // Drain helper: close any pending bodies left in the channel + // after we decide to abort. The producer's own deferred + // per-pending close (on ctx cancel during send-select) covers + // the in-flight body it is currently fetching; this loop covers + // the buffered ones the consumer never reaches. + drain := func() { + for p := range ch { + if p.rc != nil { + _ = p.rc.Close() //nolint:errcheck // drain best-effort + } + } + } + + expectedIdx := firstIdx + + for p := range ch { + if p.err != nil { + if p.rc != nil { + _ = p.rc.Close() //nolint:errcheck // close error-path body + } + + h.log.LogAttrs(ctx, slog.LevelWarn, "mid-stream chunk fetch failed", + slog.String("bucket", bucket), + slog.String("key", key), + slog.Int64("chunk", p.idx), + slog.Any("err", p.err), + ) + cancelPrefetch() + drain() + + return + } + + if p.idx != expectedIdx { + // Defensive: producer is required to deliver chunks in + // index order. A mismatch indicates a programming error + // upstream; treat as mid-stream abort. + if p.rc != nil { + _ = p.rc.Close() //nolint:errcheck + } + + h.log.LogAttrs(ctx, slog.LevelError, "readahead order violation", + slog.String("bucket", bucket), + slog.String("key", key), + slog.Int64("expected", expectedIdx), + slog.Int64("got", p.idx), + ) + cancelPrefetch() + drain() + + return + } + + h.log.LogAttrs(ctx, slog.LevelDebug, "edge_get_chunk_next", + slog.String("bucket", bucket), + slog.String("key", key), + slog.Int64("chunk", p.idx), + ) + + off, length := chunk.ChunkSlice(p.idx, chunkSize, rangeStart, rangeEnd, info.Size) + if err := streamSlice(w, p.rc, off, length); err != nil { + _ = p.rc.Close() //nolint:errcheck + h.log.LogAttrs(ctx, slog.LevelWarn, "mid-stream copy failed", + slog.String("bucket", bucket), + slog.String("key", key), + slog.Int64("chunk", p.idx), + slog.Any("err", err), + ) + cancelPrefetch() + drain() + + return + } + + _ = p.rc.Close() //nolint:errcheck + + if f, ok := w.(http.Flusher); ok { + f.Flush() + } + + expectedIdx++ + } + + if expectedIdx <= lastIdx { + // Channel closed before all chunks were delivered. The + // producer either panicked (already logged) or its context + // was cancelled (client disconnect or earlier mid-stream + // abort - the latter would have returned above). Surface as + // a mid-stream warning so operators see truncated responses. + h.log.LogAttrs(ctx, slog.LevelWarn, "readahead truncated response", + slog.String("bucket", bucket), + slog.String("key", key), + slog.Int64("expected_through", lastIdx), + slog.Int64("delivered_through", expectedIdx-1), + ) + } +} + +// prefetchChunks fetches chunks [firstIdx, lastIdx] into a bounded +// channel of capacity depth, with up to depth fetches in flight in +// parallel. Bodies are delivered in chunk-index order so the +// consumer can stream them straight to the client without +// reassembly. Caller drains the channel and owns Close() for any +// non-nil rc it receives. +// +// Fan-out model: +// - A dispatcher goroutine spawns one worker goroutine per chunk +// index, gated by a depth-sized job queue so peak in-flight +// workers stays at depth (+ at most one in-flight push and one +// in-flight delivery). +// - Each worker calls h.fc.GetChunk for its chunk and writes the +// result to a per-job, 1-buffered result channel. +// - The dispatcher pushes job descriptors onto the queue in +// chunk-index order so the delivery loop reads results in that +// same order. +// +// Lifecycle: +// - All workers ALWAYS write exactly once to their result channel +// before exiting. This invariant lets the delivery loop block +// on `<-j.rc` without risk of deadlock even on ctx-cancel. +// - On ctx cancellation the dispatcher drains its currently-spawned +// worker (waiting for the unconditional rc write) and exits. +// The delivery loop drains any remaining queued jobs the same +// way, closing the body in each result. +// - Producer panics are recovered, logged, and surface to the +// consumer as an early channel close; the consumer treats that +// as a mid-stream abort. +func (h *EdgeHandler) prefetchChunks( + ctx context.Context, + bucket, key, etag string, + chunkSize, objectSize int64, + firstIdx, lastIdx int64, + depth int, +) <-chan pendingChunk { + out := make(chan pendingChunk, depth) + + queue := make(chan readaheadJob, depth) + + // Dispatcher: spawn workers in chunk-index order, gated by the + // queue's capacity. Each worker is independent and runs to + // completion (always writes its result), so the dispatcher + // doesn't need to track them after spawning. + go func() { + defer close(queue) + defer func() { + if r := recover(); r != nil { + h.log.LogAttrs(ctx, slog.LevelError, "readahead dispatcher panic", + slog.String("bucket", bucket), + slog.String("key", key), + slog.Any("panic", r), + ) + } + }() + + for ci := firstIdx; ci <= lastIdx; ci++ { + if err := ctx.Err(); err != nil { + return + } + + rc := make(chan pendingChunk, 1) + + // Spawn worker first so the result channel always + // receives a write, even if ctx is cancelled while we + // block on the queue push below. The worker's + // GetChunk call will short-circuit on a cancelled ctx + // with err != nil and rc == nil, satisfying the + // "always write" invariant. + go func(idx int64, rc chan<- pendingChunk) { + defer func() { + if r := recover(); r != nil { + h.log.LogAttrs(ctx, slog.LevelError, "readahead worker panic", + slog.String("bucket", bucket), + slog.String("key", key), + slog.Int64("chunk", idx), + slog.Any("panic", r), + ) + // Preserve the write-once invariant: send a + // synthetic error so the delivery loop sees + // the panic-affected chunk as a fetch error + // rather than blocking forever on rc. + rc <- pendingChunk{idx: idx, err: fmt.Errorf("readahead worker panic: %v", r)} + } + }() + + ckey := chunk.Key{ + OriginID: h.cfg.Origin.ID, + Bucket: bucket, + ObjectKey: key, + ETag: etag, + ChunkSize: chunkSize, + Index: idx, + } + + body, err := h.fc.GetChunk(ctx, ckey, objectSize) + rc <- pendingChunk{idx: idx, rc: body, err: err} + }(ci, rc) + + select { + case queue <- readaheadJob{idx: ci, rc: rc}: + case <-ctx.Done(): + // Worker is in flight; drain it so the body (if any) + // is closed and the goroutine doesn't leak. + p := <-rc + if p.rc != nil { + _ = p.rc.Close() //nolint:errcheck // ctx-cancel body close best-effort + } + + return + } + } + }() + + // Delivery: read worker results in chunk-index order and forward + // to `out`. Drains in-flight jobs on ctx-cancel. + go func() { + defer close(out) + defer func() { + if r := recover(); r != nil { + h.log.LogAttrs(ctx, slog.LevelError, "readahead delivery panic", + slog.String("bucket", bucket), + slog.String("key", key), + slog.Any("panic", r), + ) + } + }() + + for j := range queue { + p := <-j.rc // worker always writes; safe blocking read + + if err := ctx.Err(); err != nil { + if p.rc != nil { + _ = p.rc.Close() //nolint:errcheck // drain best-effort + } + + drainQueue(queue) + + return + } + + select { + case out <- p: + case <-ctx.Done(): + if p.rc != nil { + _ = p.rc.Close() //nolint:errcheck // drain best-effort + } + + drainQueue(queue) + + return + } + } + }() + + return out +} + +// drainQueue is a helper that empties any remaining job descriptors +// from the readahead queue, waits for each spawned worker to deliver +// its result, and closes any body the result carries. Used on +// ctx-cancel cleanup paths so worker goroutines and cachestore +// response bodies do not leak when the consumer aborts mid-stream. +func drainQueue(queue <-chan readaheadJob) { + for j := range queue { + p := <-j.rc + if p.rc != nil { + _ = p.rc.Close() //nolint:errcheck // cleanup best-effort + } + } +} + +// streamSlice copies length bytes starting at off from src to dst. +func streamSlice(dst io.Writer, src io.Reader, off, length int64) error { + if off > 0 { + if _, err := io.CopyN(io.Discard, src, off); err != nil { + return err + } + } + + if length > 0 { + if _, err := io.CopyN(dst, src, length); err != nil { + return err + } + } + + return nil +} + +// handleList is a thin pass-through to Origin.List for v1 prototype. +func (h *EdgeHandler) handleList(w http.ResponseWriter, r *http.Request, bucket string) { + // Pass-through; very minimal S3 ListObjectsV2 shape. Reviewers can + // curl this for sanity but full S3 list semantics are not in MVP. + prefix := r.URL.Query().Get("prefix") + marker := r.URL.Query().Get("continuation-token") + maxStr := r.URL.Query().Get("max-keys") + maxKeys := 1000 + + if maxStr != "" { + if v, err := strconv.Atoi(maxStr); err == nil && v > 0 { + maxKeys = v + } + } + + type listEntry struct { + Key string `xml:"Key"` + Size int64 `xml:"Size"` + ETag string `xml:"ETag"` + } + + type listResult struct { + XMLName xml.Name `xml:"ListBucketResult"` + Name string `xml:"Name"` + Prefix string `xml:"Prefix"` + KeyCount int `xml:"KeyCount"` + MaxKeys int `xml:"MaxKeys"` + IsTruncated bool `xml:"IsTruncated"` + NextMarker string `xml:"NextContinuationToken,omitempty"` + Contents []listEntry `xml:"Contents"` + } + + or := h.fc.Origin() + + res, err := or.List(r.Context(), bucket, prefix, marker, maxKeys) + if err != nil { + h.writeOriginError(w, err) + return + } + + body := listResult{ + Name: bucket, + Prefix: prefix, + KeyCount: len(res.Entries), + MaxKeys: maxKeys, + IsTruncated: res.IsTruncated, + NextMarker: res.NextMarker, + } + for _, e := range res.Entries { + body.Contents = append(body.Contents, listEntry{Key: e.Key, Size: e.Size, ETag: e.ETag}) + } + + w.Header().Set("Content-Type", "application/xml") + w.WriteHeader(http.StatusOK) + enc := xml.NewEncoder(w) + + if err := enc.Encode(body); err != nil { + // Headers already sent; we cannot change the status. Log so + // truncated / malformed LIST responses are visible, matching + // the mid-stream warn-level treatment in the GET path. + h.log.LogAttrs(r.Context(), slog.LevelWarn, "list xml encode failed", + slog.String("bucket", bucket), + slog.String("prefix", prefix), + slog.Any("err", err), + ) + } +} + +func (h *EdgeHandler) notImplemented(w http.ResponseWriter, op string) { + http.Error(w, op+" not implemented in MVP", http.StatusNotImplemented) +} + +func (h *EdgeHandler) writeOriginError(w http.ResponseWriter, err error) { + switch { + case errors.Is(err, origin.ErrNotFound): + http.Error(w, "NoSuchKey", http.StatusNotFound) + case errors.Is(err, origin.ErrAuth): + http.Error(w, "Unauthorized origin", http.StatusBadGateway) + default: + var ( + ube *origin.UnsupportedBlobTypeError + ec *origin.OriginETagChangedError + mte *origin.MissingETagError + ) + + switch { + case errors.As(err, &ube): + http.Error(w, "OriginUnsupported: "+ube.Error(), http.StatusBadGateway) + case errors.As(err, &ec): + http.Error(w, "OriginETagChanged", http.StatusBadGateway) + case errors.As(err, &mte): + http.Error(w, "OriginMissingETag: "+mte.Error(), http.StatusBadGateway) + default: + h.log.LogAttrs(context.Background(), slog.LevelWarn, "origin error", + slog.Any("err", err), + ) + http.Error(w, "OriginUnreachable", http.StatusBadGateway) + } + } +} + +func setObjectHeaders(w http.ResponseWriter, info origin.ObjectInfo) { + if info.ContentType != "" { + w.Header().Set("Content-Type", info.ContentType) + } + + if info.ETag != "" { + w.Header().Set("ETag", "\""+info.ETag+"\"") + } + + w.Header().Set("Accept-Ranges", "bytes") +} + +func splitPath(p string) (bucket, key string) { + p = strings.TrimPrefix(p, "/") + if p == "" { + return "", "" + } + + idx := strings.IndexByte(p, '/') + if idx < 0 { + return p, "" + } + + return p[:idx], p[idx+1:] +} + +func parseSimpleByteRange(h string, size int64) (start, end int64, ok bool) { + if !strings.HasPrefix(h, "bytes=") { + return 0, 0, false + } + + spec := strings.TrimPrefix(h, "bytes=") + + parts := strings.Split(spec, "-") + if len(parts) != 2 { + return 0, 0, false + } + + if parts[0] == "" { + // Suffix: -N (last N bytes) + n, err := strconv.ParseInt(parts[1], 10, 64) + if err != nil || n <= 0 || n > size { + return 0, 0, false + } + + return size - n, size - 1, true + } + + s, err := strconv.ParseInt(parts[0], 10, 64) + if err != nil || s < 0 { + return 0, 0, false + } + + if parts[1] == "" { + return s, size - 1, true + } + + e, err := strconv.ParseInt(parts[1], 10, 64) + if err != nil || e < s { + return 0, 0, false + } + + if e >= size { + e = size - 1 + } + + return s, e, true +} + +// InternalHandler implements GET /internal/fill on the internal +// listener. Plain HTTP/2 (no mTLS) in dev. +type InternalHandler struct { + fc internalFetchAPI + cl *cluster.Cluster + log *slog.Logger +} + +// internalFetchAPI is the surface area InternalHandler depends on. The +// real *fetch.Coordinator satisfies it; tests substitute small fakes. +type internalFetchAPI interface { + FillForPeer(ctx context.Context, k chunk.Key, objectSize int64) (io.ReadCloser, error) +} + +// NewInternalHandler wires the internal handler. +func NewInternalHandler(fc internalFetchAPI, cl *cluster.Cluster, log *slog.Logger) *InternalHandler { + return &InternalHandler{fc: fc, cl: cl, log: log} +} + +// ServeHTTP handles GET /internal/fill?. +func (h *InternalHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/internal/fill" { + http.NotFound(w, r) + return + } + + if r.Method != http.MethodGet { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + + if r.Header.Get("X-Orca-Internal") != "1" { + http.Error(w, "missing X-Orca-Internal header", http.StatusBadRequest) + return + } + + k, objectSize, err := cluster.DecodeChunkKey(r.URL.Query()) + if err != nil { + http.Error(w, "invalid chunk key: "+err.Error(), http.StatusBadRequest) + return + } + + h.log.LogAttrs(r.Context(), slog.LevelDebug, "internal_fill_request", + intChunkAttrs(k), + slog.Int64("object_size", objectSize), + slog.String("remote", r.RemoteAddr), + ) + + if !h.cl.IsCoordinator(k) { + h.log.LogAttrs(r.Context(), slog.LevelDebug, "internal_fill_not_coordinator", + intChunkAttrs(k), + slog.String("remote", r.RemoteAddr), + ) + http.Error(w, `{"reason":"not_coordinator"}`, http.StatusConflict) + + return + } + + body, err := h.fc.FillForPeer(r.Context(), k, objectSize) + if err != nil { + h.log.LogAttrs(r.Context(), slog.LevelWarn, "internal fill failed", + intChunkAttrs(k), + slog.Any("err", err), + ) + http.Error(w, "fill failed", http.StatusBadGateway) + + return + } + defer body.Close() //nolint:errcheck // internal-fill body close best-effort + + // Set Content-Length so the requesting peer can validate the + // streamed body length and detect mid-stream truncation. If the + // expected length is zero (unknown objectSize or empty chunk) we + // omit Content-Length; the requester then falls back to + // connection-close framing without length validation. + expectedLen := k.ExpectedLen(objectSize) + if expectedLen > 0 { + w.Header().Set("Content-Length", strconv.FormatInt(expectedLen, 10)) + } + + w.Header().Set("Content-Type", "application/octet-stream") + w.WriteHeader(http.StatusOK) + + if _, copyErr := io.Copy(w, body); copyErr != nil { + h.log.LogAttrs(r.Context(), slog.LevelWarn, "internal fill copy failed", + intChunkAttrs(k), + slog.Any("err", copyErr), + ) + + return + } + + h.log.LogAttrs(r.Context(), slog.LevelDebug, "internal_fill_complete", + intChunkAttrs(k), + slog.Int64("bytes", expectedLen), + ) +} + +// intChunkAttrs renders the chunk's identifying tuple as a slog +// group attribute matching the cross-package 'chunk' taxonomy. +func intChunkAttrs(k chunk.Key) slog.Attr { + return slog.Group("chunk", + slog.String("origin_id", k.OriginID), + slog.String("bucket", k.Bucket), + slog.String("key", k.ObjectKey), + slog.Int64("index", k.Index), + ) +} diff --git a/internal/orca/server/server_test.go b/internal/orca/server/server_test.go new file mode 100644 index 00000000..b95ccb51 --- /dev/null +++ b/internal/orca/server/server_test.go @@ -0,0 +1,1389 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package server + +import ( + "bytes" + "context" + "encoding/xml" + "errors" + "io" + "log/slog" + "net/http" + "net/http/httptest" + "strconv" + "strings" + "sync" + "testing" + "time" + + "github.com/Azure/unbounded/internal/orca/chunk" + "github.com/Azure/unbounded/internal/orca/cluster" + "github.com/Azure/unbounded/internal/orca/config" + "github.com/Azure/unbounded/internal/orca/origin" +) + +// fakeEdgeAPI satisfies edgeFetchAPI with canned responses for unit +// tests. Only the field for the call you want to mock needs to be +// set; an unset *Func panics if the test invokes the corresponding +// method. +type fakeEdgeAPI struct { + HeadObjectFunc func(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) + GetChunkFunc func(ctx context.Context, k chunk.Key, objectSize int64) (io.ReadCloser, error) + OriginVal origin.Origin +} + +func (f *fakeEdgeAPI) HeadObject(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) { + return f.HeadObjectFunc(ctx, bucket, key) +} + +func (f *fakeEdgeAPI) GetChunk(ctx context.Context, k chunk.Key, objectSize int64) (io.ReadCloser, error) { + return f.GetChunkFunc(ctx, k, objectSize) +} + +func (f *fakeEdgeAPI) Origin() origin.Origin { return f.OriginVal } + +// fakeOrigin satisfies origin.Origin for handler tests. Only the +// fields used in the test need to be populated. +type fakeOrigin struct { + HeadFunc func(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) + GetRangeFunc func(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error) + ListFunc func(ctx context.Context, bucket, prefix, marker string, max int) (origin.ListResult, error) +} + +func (f *fakeOrigin) Head(ctx context.Context, bucket, key string) (origin.ObjectInfo, error) { + return f.HeadFunc(ctx, bucket, key) +} + +func (f *fakeOrigin) GetRange(ctx context.Context, bucket, key, etag string, off, n int64) (io.ReadCloser, error) { + return f.GetRangeFunc(ctx, bucket, key, etag, off, n) +} + +func (f *fakeOrigin) List(ctx context.Context, bucket, prefix, marker string, max int) (origin.ListResult, error) { + return f.ListFunc(ctx, bucket, prefix, marker, max) +} + +// TestWriteOriginError covers all five branches of the error mapping. +// Previously only ErrNotFound was exercised (via integration test). +func TestWriteOriginError(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + err error + wantStatus int + wantBody string + }{ + { + name: "not found", + err: origin.ErrNotFound, + wantStatus: http.StatusNotFound, + wantBody: "NoSuchKey", + }, + { + name: "auth", + err: origin.ErrAuth, + wantStatus: http.StatusBadGateway, + wantBody: "Unauthorized origin", + }, + { + name: "unsupported blob type", + err: &origin.UnsupportedBlobTypeError{ + Bucket: "ctr", + Key: "page-blob", + BlobType: "PageBlob", + }, + wantStatus: http.StatusBadGateway, + wantBody: "OriginUnsupported", + }, + { + name: "etag changed", + err: &origin.OriginETagChangedError{ + Bucket: "b", Key: "k", Want: "old", + }, + wantStatus: http.StatusBadGateway, + wantBody: "OriginETagChanged", + }, + { + name: "generic error", + err: errors.New("unexpected"), + wantStatus: http.StatusBadGateway, + wantBody: "OriginUnreachable", + }, + } + + h := &EdgeHandler{log: discardLogger()} + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + rr := httptest.NewRecorder() + h.writeOriginError(rr, tt.err) + + if rr.Code != tt.wantStatus { + t.Errorf("status=%d want %d", rr.Code, tt.wantStatus) + } + + if !strings.Contains(rr.Body.String(), tt.wantBody) { + t.Errorf("body %q does not contain %q", rr.Body.String(), tt.wantBody) + } + }) + } +} + +// TestHandleHead covers metadata propagation and the not-found error +// path on HEAD requests. +func TestHandleHead(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + info origin.ObjectInfo + err error + wantStatus int + wantHdrs map[string]string + }{ + { + name: "normal blob", + info: origin.ObjectInfo{ + Size: 1024, + ETag: "abc123", + ContentType: "application/octet-stream", + }, + wantStatus: http.StatusOK, + wantHdrs: map[string]string{ + "Content-Length": "1024", + "ETag": `"abc123"`, + "Content-Type": "application/octet-stream", + }, + }, + { + name: "missing content type omits header", + info: origin.ObjectInfo{Size: 99, ETag: "x"}, + wantStatus: http.StatusOK, + wantHdrs: map[string]string{ + "Content-Length": "99", + "ETag": `"x"`, + }, + }, + { + name: "missing etag omits header", + info: origin.ObjectInfo{Size: 7}, + wantStatus: http.StatusOK, + wantHdrs: map[string]string{ + "Content-Length": "7", + }, + }, + { + name: "origin not found yields 404", + err: origin.ErrNotFound, + wantStatus: http.StatusNotFound, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fc := &fakeEdgeAPI{ + HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) { + return tt.info, tt.err + }, + } + h := NewEdgeHandler(fc, &config.Config{}, discardLogger()) + + req := httptest.NewRequest(http.MethodHead, "/bucket/key", nil) + rr := httptest.NewRecorder() + h.handleHead(rr, req, "bucket", "key") + + if rr.Code != tt.wantStatus { + t.Errorf("status=%d want %d", rr.Code, tt.wantStatus) + } + + for k, want := range tt.wantHdrs { + got := rr.Header().Get(k) + if got != want { + t.Errorf("header %s=%q want %q", k, got, want) + } + } + + if rr.Body.Len() != 0 && tt.wantStatus == http.StatusOK { + t.Errorf("HEAD body should be empty; got %d bytes", rr.Body.Len()) + } + }) + } +} + +// TestHandleList covers the XML pass-through, prefix propagation, +// truncation, and empty-list handling. +func TestHandleList(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + prefix string + listResult origin.ListResult + listErr error + wantStatus int + wantKeys []string + wantTrunc bool + wantNextTok string + }{ + { + name: "normal list", + prefix: "alpha/", + listResult: origin.ListResult{ + Entries: []origin.ObjectEntry{ + {Key: "alpha/one", Size: 3, ETag: "e1"}, + {Key: "alpha/two", Size: 5, ETag: "e2"}, + }, + }, + wantStatus: http.StatusOK, + wantKeys: []string{"alpha/one", "alpha/two"}, + }, + { + name: "empty list", + prefix: "missing/", + listResult: origin.ListResult{}, + wantStatus: http.StatusOK, + wantKeys: nil, + }, + { + name: "truncated list", + listResult: origin.ListResult{ + Entries: []origin.ObjectEntry{{Key: "k1"}}, + IsTruncated: true, + NextMarker: "next-page", + }, + wantStatus: http.StatusOK, + wantKeys: []string{"k1"}, + wantTrunc: true, + wantNextTok: "next-page", + }, + { + name: "origin error yields 502", + listErr: errors.New("upstream broken"), + wantStatus: http.StatusBadGateway, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + or := &fakeOrigin{ + ListFunc: func(_ context.Context, bucket, prefix, _ string, _ int) (origin.ListResult, error) { + if bucket != "b" { + t.Errorf("bucket=%q want %q", bucket, "b") + } + + if prefix != tt.prefix { + t.Errorf("prefix=%q want %q", prefix, tt.prefix) + } + + return tt.listResult, tt.listErr + }, + } + fc := &fakeEdgeAPI{OriginVal: or} + h := NewEdgeHandler(fc, &config.Config{}, discardLogger()) + + req := httptest.NewRequest(http.MethodGet, + "/b/?list-type=2&prefix="+tt.prefix, nil) + rr := httptest.NewRecorder() + h.handleList(rr, req, "b") + + if rr.Code != tt.wantStatus { + t.Errorf("status=%d want %d body=%s", rr.Code, tt.wantStatus, rr.Body.String()) + } + + if tt.wantStatus != http.StatusOK { + return + } + + var got struct { + XMLName xml.Name `xml:"ListBucketResult"` + Name string `xml:"Name"` + Prefix string `xml:"Prefix"` + KeyCount int `xml:"KeyCount"` + IsTruncated bool `xml:"IsTruncated"` + NextMarker string `xml:"NextContinuationToken"` + Contents []struct { + Key string `xml:"Key"` + } `xml:"Contents"` + } + if err := xml.Unmarshal(rr.Body.Bytes(), &got); err != nil { + t.Fatalf("xml decode: %v body=%s", err, rr.Body.String()) + } + + if got.Name != "b" { + t.Errorf("Name=%q want %q", got.Name, "b") + } + + if got.Prefix != tt.prefix { + t.Errorf("Prefix=%q want %q", got.Prefix, tt.prefix) + } + + if got.KeyCount != len(tt.wantKeys) { + t.Errorf("KeyCount=%d want %d", got.KeyCount, len(tt.wantKeys)) + } + + if got.IsTruncated != tt.wantTrunc { + t.Errorf("IsTruncated=%v want %v", got.IsTruncated, tt.wantTrunc) + } + + if got.NextMarker != tt.wantNextTok { + t.Errorf("NextMarker=%q want %q", got.NextMarker, tt.wantNextTok) + } + + gotKeys := make([]string, 0, len(got.Contents)) + for _, c := range got.Contents { + gotKeys = append(gotKeys, c.Key) + } + + if !equalStrings(gotKeys, tt.wantKeys) { + t.Errorf("keys=%v want %v", gotKeys, tt.wantKeys) + } + }) + } +} + +// TestParseSimpleByteRange covers all parser branches. +func TestParseSimpleByteRange(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + header string + size int64 + wantStart int64 + wantEnd int64 + wantOK bool + }{ + {"normal range", "bytes=0-99", 1024, 0, 99, true}, + {"suffix range", "bytes=-100", 1024, 924, 1023, true}, + {"open-ended", "bytes=100-", 1024, 100, 1023, true}, + {"end clamped to size", "bytes=0-9999", 1024, 0, 1023, true}, + {"start > end rejected", "bytes=100-50", 1024, 0, 0, false}, + {"missing prefix rejected", "0-99", 1024, 0, 0, false}, + {"multi-range rejected", "bytes=0-99,200-299", 1024, 0, 0, false}, + {"empty rejected", "", 1024, 0, 0, false}, + {"bytes= alone rejected", "bytes=", 1024, 0, 0, false}, + {"suffix larger than size rejected", "bytes=-9999", 1024, 0, 0, false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + s, e, ok := parseSimpleByteRange(tt.header, tt.size) + if ok != tt.wantOK { + t.Fatalf("ok=%v want %v (s=%d e=%d)", ok, tt.wantOK, s, e) + } + + if !ok { + return + } + + if s != tt.wantStart || e != tt.wantEnd { + t.Errorf("(s,e)=(%d,%d) want (%d,%d)", s, e, tt.wantStart, tt.wantEnd) + } + }) + } +} + +// TestSplitPath covers path splitting edge cases. +func TestSplitPath(t *testing.T) { + t.Parallel() + + tests := []struct { + in string + wantBucket string + wantKey string + }{ + {"", "", ""}, + {"/", "", ""}, + {"/bucket", "bucket", ""}, + {"/bucket/", "bucket", ""}, + {"/bucket/key", "bucket", "key"}, + {"/bucket/path/to/key", "bucket", "path/to/key"}, + } + + for _, tt := range tests { + t.Run(tt.in, func(t *testing.T) { + b, k := splitPath(tt.in) + if b != tt.wantBucket || k != tt.wantKey { + t.Errorf("splitPath(%q)=(%q,%q) want (%q,%q)", + tt.in, b, k, tt.wantBucket, tt.wantKey) + } + }) + } +} + +// TestSetObjectHeaders covers header propagation including the +// always-set Accept-Ranges and the conditionally-set fields. +func TestSetObjectHeaders(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + info origin.ObjectInfo + want map[string]string + }{ + { + name: "all fields set", + info: origin.ObjectInfo{ETag: "abc", ContentType: "text/plain"}, + want: map[string]string{ + "ETag": `"abc"`, + "Content-Type": "text/plain", + "Accept-Ranges": "bytes", + }, + }, + { + name: "missing content type", + info: origin.ObjectInfo{ETag: "abc"}, + want: map[string]string{ + "ETag": `"abc"`, + "Content-Type": "", + "Accept-Ranges": "bytes", + }, + }, + { + name: "missing etag", + info: origin.ObjectInfo{ContentType: "text/plain"}, + want: map[string]string{ + "ETag": "", + "Content-Type": "text/plain", + "Accept-Ranges": "bytes", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + rr := httptest.NewRecorder() + setObjectHeaders(rr, tt.info) + + for k, want := range tt.want { + if got := rr.Header().Get(k); got != want { + t.Errorf("header %s=%q want %q", k, got, want) + } + } + }) + } +} + +// errReader is an io.ReadCloser whose first Read returns errFirst. +// Used to simulate cachestore-backed bodies that fail on their first +// network read (e.g. azureblob returning a 503 mid-stream after the +// header transaction succeeded). +type errReader struct { + errFirst error + closed bool +} + +func (r *errReader) Read(_ []byte) (int, error) { return 0, r.errFirst } +func (r *errReader) Close() error { r.closed = true; return nil } + +// TestHandleGet_EmptyObject_NoRange_Returns200 verifies that a GET +// against a zero-byte object responds with 200 + Content-Length: 0 +// and an empty body. Previously the handler computed rangeEnd = -1 +// and fell into the unsatisfiable-range branch, returning a spurious +// 416 for what should be a successful empty-body fetch. +func TestHandleGet_EmptyObject_NoRange_Returns200(t *testing.T) { + t.Parallel() + + info := origin.ObjectInfo{Size: 0, ETag: "etag-empty", ContentType: "application/octet-stream"} + + fc := &fakeEdgeAPI{ + HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) { + return info, nil + }, + // GetChunkFunc deliberately unset; the short-circuit must + // not call into the fetch coordinator for zero-byte objects. + } + + cfg := &config.Config{Chunking: config.Chunking{Size: 1024}} + h := NewEdgeHandler(fc, cfg, discardLogger()) + + req := httptest.NewRequest(http.MethodGet, "/bucket/empty", nil) + rr := httptest.NewRecorder() + h.handleGet(rr, req, "bucket", "empty") + + if rr.Code != http.StatusOK { + t.Errorf("status=%d want %d", rr.Code, http.StatusOK) + } + + if rr.Body.Len() != 0 { + t.Errorf("body=%d bytes, want 0", rr.Body.Len()) + } + + if got := rr.Header().Get("Content-Length"); got != "0" { + t.Errorf("Content-Length=%q want %q", got, "0") + } +} + +// TestHandleGet_EmptyObject_WithRange_Returns416 verifies that a +// Range request against a zero-byte object remains a 416. RFC 7233 +// classifies any range over a zero-byte representation as +// unsatisfiable. +func TestHandleGet_EmptyObject_WithRange_Returns416(t *testing.T) { + t.Parallel() + + info := origin.ObjectInfo{Size: 0, ETag: "etag-empty"} + + fc := &fakeEdgeAPI{ + HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) { + return info, nil + }, + } + + cfg := &config.Config{Chunking: config.Chunking{Size: 1024}} + h := NewEdgeHandler(fc, cfg, discardLogger()) + + req := httptest.NewRequest(http.MethodGet, "/bucket/empty", nil) + req.Header.Set("Range", "bytes=0-0") + + rr := httptest.NewRecorder() + h.handleGet(rr, req, "bucket", "empty") + + if rr.Code != http.StatusRequestedRangeNotSatisfiable { + t.Errorf("status=%d want %d", rr.Code, http.StatusRequestedRangeNotSatisfiable) + } +} + +// TestHandleGet_FirstChunkErrorReturnsCleanError verifies that when +// the very first chunk fetch fails the edge handler responds with an +// S3-style error response (proper status + error body) rather than +// committing a 200 status and then aborting the connection +// mid-stream. +// +// Regression test for B4. +func TestHandleGet_FirstChunkErrorReturnsCleanError(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + fetchErr error + peekErr error // non-nil means GetChunk succeeds but first Read fails + wantStatus int + wantBody string // substring assertion on the error body + }{ + { + name: "GetChunk returns NotFound", + fetchErr: origin.ErrNotFound, + wantStatus: http.StatusNotFound, + wantBody: "NoSuchKey", + }, + { + name: "GetChunk returns generic origin error", + fetchErr: errors.New("origin: connect: timeout"), + wantStatus: http.StatusBadGateway, + wantBody: "OriginUnreachable", + }, + { + name: "GetChunk succeeds but first Read fails", + peekErr: errors.New("cachestore: blob fetch 503"), + wantStatus: http.StatusBadGateway, + wantBody: "OriginUnreachable", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + info := origin.ObjectInfo{ + Size: 1024, + ETag: "etag1", + ContentType: "application/octet-stream", + } + + fc := &fakeEdgeAPI{ + HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) { + return info, nil + }, + GetChunkFunc: func(_ context.Context, _ chunk.Key, _ int64) (io.ReadCloser, error) { + if tt.fetchErr != nil { + return nil, tt.fetchErr + } + + return &errReader{errFirst: tt.peekErr}, nil + }, + } + + cfg := &config.Config{Chunking: config.Chunking{Size: 1024}} + h := NewEdgeHandler(fc, cfg, discardLogger()) + + req := httptest.NewRequest(http.MethodGet, "/bucket/key", nil) + rr := httptest.NewRecorder() + h.handleGet(rr, req, "bucket", "key") + + if rr.Code != tt.wantStatus { + t.Errorf("status=%d want %d; body=%q", rr.Code, tt.wantStatus, rr.Body.String()) + } + + if !strings.Contains(rr.Body.String(), tt.wantBody) { + t.Errorf("body=%q want substring %q", rr.Body.String(), tt.wantBody) + } + // A bug here would 200 first, then write nothing or + // partial bytes; verify the response did not commit a + // success status that contradicts the error. + if rr.Code == http.StatusOK { + t.Errorf("handler committed 200 before failure became known") + } + }) + } +} + +type fakeInternalFetchAPI struct { + body []byte +} + +func (f *fakeInternalFetchAPI) FillForPeer(_ context.Context, _ chunk.Key, _ int64) (io.ReadCloser, error) { + return io.NopCloser(strings.NewReader(string(f.body))), nil +} + +// singleSelfPeerSource produces a peer-set containing only self. +// IsCoordinator therefore returns true for every key, letting the +// internal-fill handler proceed past its coordinator check without +// requiring the test to know the rendezvous-hash outcome. +type singleSelfPeerSource struct{} + +func (singleSelfPeerSource) Peers(_ context.Context) ([]cluster.Peer, error) { + return []cluster.Peer{{IP: "10.0.0.1", Self: true}}, nil +} + +// TestInternalHandler_SetsContentLength verifies the internal-fill +// handler sets Content-Length to chunk.Key.ExpectedLen(objectSize) +// on the response. Setting the header allows the requesting peer to +// detect mid-stream truncation via net/http's standard io.ErrUnexpectedEOF +// surfacing; without it, a truncated peer response would be +// indistinguishable from a clean EOF. +// +// Regression test for B7. +func TestInternalHandler_SetsContentLength(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + chunkSize int64 + index int64 + objectSize int64 + wantLen string + }{ + { + name: "full chunk", + chunkSize: 1024, + index: 0, + objectSize: 4096, + wantLen: "1024", + }, + { + // The fake body returns chunkSize=1024 bytes but the + // tail-chunk ExpectedLen is 428 (3500 - 3*1024). The + // resulting Content-Length: 428 can only come from the + // handler computing ExpectedLen explicitly, proving the + // header is not auto-derived from the body length. + name: "tail chunk partial", + chunkSize: 1024, + index: 3, + objectSize: 3500, + wantLen: "428", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + c, err := cluster.New(t.Context(), + config.Cluster{ + Service: "test", + SelfPodIP: "10.0.0.1", + MembershipRefresh: time.Hour, + InternalListen: "0.0.0.0:8444", + }, + cluster.WithPeerSource(singleSelfPeerSource{}), + ) + if err != nil { + t.Fatalf("cluster.New: %v", err) + } + + t.Cleanup(func() { _ = c.Close(context.Background()) }) + + h := NewInternalHandler(&fakeInternalFetchAPI{body: make([]byte, tt.chunkSize)}, c, discardLogger()) + + req := httptest.NewRequest(http.MethodGet, "/internal/fill?"+(func() string { + k := chunk.Key{ + OriginID: "origin", + Bucket: "bucket", + ObjectKey: "key", + ETag: "etag", + ChunkSize: tt.chunkSize, + Index: tt.index, + } + + return encodeQuery(k, tt.objectSize) + })(), nil) + req.Header.Set("X-Orca-Internal", "1") + + rr := httptest.NewRecorder() + h.ServeHTTP(rr, req) + + if rr.Code != http.StatusOK { + t.Fatalf("status = %d want 200; body=%q", rr.Code, rr.Body.String()) + } + + got := rr.Header().Get("Content-Length") + if got != tt.wantLen { + t.Errorf("Content-Length = %q want %q", got, tt.wantLen) + } + }) + } +} + +// encodeQuery duplicates cluster.encodeChunkKey for test purposes +// (it is unexported in the cluster package). +func encodeQuery(k chunk.Key, objectSize int64) string { + return "origin_id=" + k.OriginID + + "&bucket=" + k.Bucket + + "&key=" + k.ObjectKey + + "&etag=" + k.ETag + + "&chunk_size=" + strconv.FormatInt(k.ChunkSize, 10) + + "&index=" + strconv.FormatInt(k.Index, 10) + + "&object_size=" + strconv.FormatInt(objectSize, 10) +} + +// helpers + +// TestEdgeHandler_DebugEmissions verifies that the edge handler +// emits a debug-level 'edge_request' trace at entry and at least +// one of the response-shape emissions for HEAD/GET. Operators rely +// on these to trace a single request across the structured-log +// output. +func TestEdgeHandler_DebugEmissions(t *testing.T) { + t.Parallel() + + info := origin.ObjectInfo{Size: 5, ETag: "etag-xyz", ContentType: "application/octet-stream"} + + fc := &fakeEdgeAPI{ + HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) { + return info, nil + }, + } + + var buf bytes.Buffer + + cfg := &config.Config{Chunking: config.Chunking{Size: 1024}} + h := NewEdgeHandler(fc, cfg, debugLoggerTo(&buf)) + + req := httptest.NewRequest(http.MethodHead, "/bkt/obj", nil) + rr := httptest.NewRecorder() + h.ServeHTTP(rr, req) + + out := buf.String() + for _, want := range []string{"edge_request", "edge_head_response", "bucket=bkt", "key=obj"} { + if !strings.Contains(out, want) { + t.Errorf("expected %q in debug output; got %q", want, out) + } + } +} + +func discardLogger() *slog.Logger { + return slog.New(slog.NewTextHandler(io.Discard, nil)) +} + +// debugLoggerTo returns a slog.Logger that writes Debug-and-above +// emissions to buf. Used by tests asserting debug-trace emission +// at known call sites. +func debugLoggerTo(buf *bytes.Buffer) *slog.Logger { + return slog.New(slog.NewTextHandler(buf, &slog.HandlerOptions{Level: slog.LevelDebug})) +} + +func equalStrings(a, b []string) bool { + if len(a) != len(b) { + return false + } + + for i := range a { + if a[i] != b[i] { + return false + } + } + + return true +} + +// readaheadConfig returns a config tailored for readahead unit tests. +// Origin.ID is required by the chunk-key construction inside +// handleGet; chunk size and readahead are explicit so each test +// controls them independently. +func readaheadConfig(chunkSize int64, readahead int) *config.Config { + r := readahead + + return &config.Config{ + Origin: config.Origin{ID: "origin"}, + Chunking: config.Chunking{ + Size: chunkSize, + Readahead: &r, + }, + } +} + +// makeChunkData returns a chunkSize-byte payload whose contents +// encode the chunk index so test assertions can verify that the +// streamed body delivers chunks in correct order. Each byte at +// offset b within chunk i is `byte((int(i) + b) % 251)`; using a +// prime modulus avoids spurious alignment on power-of-two +// boundaries. +func makeChunkData(idx int64, n int) []byte { + out := make([]byte, n) + for b := 0; b < n; b++ { + out[b] = byte((int(idx) + b) % 251) + } + + return out +} + +// trackedReadCloser is an io.ReadCloser that records Close() calls +// for the readahead-cancellation test. closedCh fires once on the +// first Close(). +type trackedReadCloser struct { + io.Reader + closed bool + closedCh chan struct{} +} + +func (t *trackedReadCloser) Close() error { + if !t.closed { + t.closed = true + close(t.closedCh) + } + + return nil +} + +// TestHandleGet_DynamicChunkSize_SmallObject verifies a small object +// (well below any tier threshold) uses the base Chunking.Size. The +// fake fetch records the chunk-key sizes seen so we can assert the +// edge handler is not regressing to the previous global-only chunk +// size on the small-object path. +func TestHandleGet_DynamicChunkSize_SmallObject(t *testing.T) { + t.Parallel() + + info := origin.ObjectInfo{Size: 100 * (1 << 20), ETag: "etag", ContentType: "application/octet-stream"} + + var ( + mu sync.Mutex + seenSizes []int64 + ) + + fc := &fakeEdgeAPI{ + HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) { + return info, nil + }, + GetChunkFunc: func(_ context.Context, k chunk.Key, _ int64) (io.ReadCloser, error) { + mu.Lock() + + seenSizes = append(seenSizes, k.ChunkSize) + mu.Unlock() + + return io.NopCloser(bytes.NewReader(makeChunkData(k.Index, int(k.ExpectedLen(info.Size))))), nil + }, + } + + cfg := &config.Config{ + Origin: config.Origin{ID: "origin"}, + Chunking: config.Chunking{ + Size: 8 << 20, + Tiers: []config.ChunkTier{ + {MinObjectSize: 1 << 30, ChunkSize: 64 << 20}, + }, + }, + } + + h := NewEdgeHandler(fc, cfg, discardLogger()) + + req := httptest.NewRequest(http.MethodGet, "/bucket/key", nil) + rr := httptest.NewRecorder() + h.handleGet(rr, req, "bucket", "key") + + if rr.Code != http.StatusOK { + t.Fatalf("status=%d want 200; body=%q", rr.Code, rr.Body.String()) + } + + mu.Lock() + defer mu.Unlock() + + if len(seenSizes) == 0 { + t.Fatalf("no chunk fetches recorded") + } + + for i, sz := range seenSizes { + if sz != 8<<20 { + t.Errorf("seenSizes[%d]=%d want 8 MiB (base)", i, sz) + } + } +} + +// TestHandleGet_DynamicChunkSize_LargeObject verifies a large object +// (above the tier threshold) uses the tier's ChunkSize and that the +// number of chunks fetched matches the larger granularity (fewer +// requests). +func TestHandleGet_DynamicChunkSize_LargeObject(t *testing.T) { + t.Parallel() + + // 700 GiB synthetic object; chunked at the 128 MiB tier this is + // 5600 chunks. We don't fetch them all in this test (we set up a + // fake that streams a tiny payload per chunk request), but we do + // confirm the chunk keys carry ChunkSize=128 MiB and the + // first-chunk path lands on Index=0. + const ( + large = int64(700) * (1 << 30) // 700 GiB + tierSz = int64(128) << 20 // 128 MiB + baseSz = int64(8) << 20 // 8 MiB + ) + + info := origin.ObjectInfo{Size: large, ETag: "etag", ContentType: "application/octet-stream"} + + // To keep the test fast we use a Range request covering exactly + // the first chunk; otherwise the handler would attempt to stream + // 700 GiB. Range bytes=0-(tierSz-1) targets chunk 0 only. + var ( + mu sync.Mutex + seenSizes []int64 + seenIdx []int64 + ) + + fc := &fakeEdgeAPI{ + HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) { + return info, nil + }, + GetChunkFunc: func(_ context.Context, k chunk.Key, _ int64) (io.ReadCloser, error) { + mu.Lock() + + seenSizes = append(seenSizes, k.ChunkSize) + seenIdx = append(seenIdx, k.Index) + mu.Unlock() + + return io.NopCloser(bytes.NewReader(makeChunkData(k.Index, int(k.ExpectedLen(info.Size))))), nil + }, + } + + cfg := &config.Config{ + Origin: config.Origin{ID: "origin"}, + Chunking: config.Chunking{ + Size: baseSz, + Tiers: []config.ChunkTier{ + {MinObjectSize: 10 * (1 << 30), ChunkSize: tierSz}, + }, + }, + } + + h := NewEdgeHandler(fc, cfg, discardLogger()) + + req := httptest.NewRequest(http.MethodGet, "/bucket/key", nil) + req.Header.Set("Range", "bytes=0-"+strconv.FormatInt(tierSz-1, 10)) + + rr := httptest.NewRecorder() + h.handleGet(rr, req, "bucket", "key") + + if rr.Code != http.StatusPartialContent { + t.Fatalf("status=%d want 206; body=%q", rr.Code, rr.Body.String()) + } + + mu.Lock() + defer mu.Unlock() + + if len(seenSizes) != 1 { + t.Fatalf("expected exactly 1 chunk fetch for first-chunk range; got %d", len(seenSizes)) + } + + if seenSizes[0] != tierSz { + t.Errorf("seenSizes[0]=%d want %d (tier size)", seenSizes[0], tierSz) + } + + if seenIdx[0] != 0 { + t.Errorf("seenIdx[0]=%d want 0", seenIdx[0]) + } +} + +// TestHandleGet_Readahead_DisabledZero verifies that Readahead=0 +// preserves the strictly-sequential behavior: GetChunk is called +// one chunk at a time, in order, with no concurrent fetches in +// flight. The fake fetch deliberately reports concurrent calls so a +// regression that started the prefetcher despite depth=0 would be +// caught. +func TestHandleGet_Readahead_DisabledZero(t *testing.T) { + t.Parallel() + + const ( + chunkSize = int64(1024) + nChunks = int64(5) + objectSize = chunkSize * nChunks + ) + + info := origin.ObjectInfo{Size: objectSize, ETag: "e", ContentType: "application/octet-stream"} + + var ( + mu sync.Mutex + inFlight int + maxInFlt int + callOrder []int64 + ) + + fc := &fakeEdgeAPI{ + HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) { + return info, nil + }, + GetChunkFunc: func(_ context.Context, k chunk.Key, _ int64) (io.ReadCloser, error) { + mu.Lock() + inFlight++ + + if inFlight > maxInFlt { + maxInFlt = inFlight + } + + callOrder = append(callOrder, k.Index) + mu.Unlock() + // Brief sleep to widen any concurrency window. + time.Sleep(5 * time.Millisecond) + + mu.Lock() + inFlight-- + mu.Unlock() + + return io.NopCloser(bytes.NewReader(makeChunkData(k.Index, int(chunkSize)))), nil + }, + } + + cfg := readaheadConfig(chunkSize, 0) + h := NewEdgeHandler(fc, cfg, discardLogger()) + + req := httptest.NewRequest(http.MethodGet, "/bucket/key", nil) + rr := httptest.NewRecorder() + h.handleGet(rr, req, "bucket", "key") + + if rr.Code != http.StatusOK { + t.Fatalf("status=%d want 200; body=%q", rr.Code, rr.Body.String()) + } + + if int64(rr.Body.Len()) != objectSize { + t.Errorf("body=%d bytes, want %d", rr.Body.Len(), objectSize) + } + + mu.Lock() + defer mu.Unlock() + + if maxInFlt != 1 { + t.Errorf("max in-flight=%d want 1 (no readahead)", maxInFlt) + } + + for i, idx := range callOrder { + if idx != int64(i) { + t.Errorf("callOrder[%d]=%d want %d (in-order serial fetch)", i, idx, i) + } + } +} + +// TestHandleGet_Readahead_ParallelHidesLatency verifies that with +// Readahead > 0 the handler can have multiple chunk fetches in +// flight concurrently. The fake fetch sleeps long enough per chunk +// that the wall-clock time for the full GET should be substantially +// less than nChunks * perChunkDelay if readahead is working. +func TestHandleGet_Readahead_ParallelHidesLatency(t *testing.T) { + t.Parallel() + + const ( + chunkSize = int64(1024) + nChunks = int64(5) + objectSize = chunkSize * nChunks + perChunkLat = 40 * time.Millisecond + readahead = 4 + ) + + info := origin.ObjectInfo{Size: objectSize, ETag: "e", ContentType: "application/octet-stream"} + + var ( + mu sync.Mutex + inFlight int + maxInFlt int + ) + + fc := &fakeEdgeAPI{ + HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) { + return info, nil + }, + GetChunkFunc: func(ctx context.Context, k chunk.Key, _ int64) (io.ReadCloser, error) { + mu.Lock() + inFlight++ + + if inFlight > maxInFlt { + maxInFlt = inFlight + } + mu.Unlock() + + select { + case <-time.After(perChunkLat): + case <-ctx.Done(): + mu.Lock() + inFlight-- + mu.Unlock() + + return nil, ctx.Err() + } + + mu.Lock() + inFlight-- + mu.Unlock() + + return io.NopCloser(bytes.NewReader(makeChunkData(k.Index, int(chunkSize)))), nil + }, + } + + cfg := readaheadConfig(chunkSize, readahead) + h := NewEdgeHandler(fc, cfg, discardLogger()) + + req := httptest.NewRequest(http.MethodGet, "/bucket/key", nil) + rr := httptest.NewRecorder() + + start := time.Now() + + h.handleGet(rr, req, "bucket", "key") + + elapsed := time.Since(start) + + if rr.Code != http.StatusOK { + t.Fatalf("status=%d want 200; body=%q", rr.Code, rr.Body.String()) + } + + if int64(rr.Body.Len()) != objectSize { + t.Errorf("body=%d bytes, want %d", rr.Body.Len(), objectSize) + } + + // Strict serial baseline = nChunks * perChunkLat. With readahead + // we expect substantially less; we conservatively assert < + // (nChunks * perChunkLat * 0.8) which gives the test plenty of + // CI slack. The exact speedup depends on scheduler timing; the + // in-flight max metric below is the deterministic assertion. + serialBaseline := time.Duration(nChunks) * perChunkLat + + if elapsed >= serialBaseline { + t.Errorf("readahead did not hide latency: elapsed=%v, serial baseline=%v", + elapsed, serialBaseline) + } + + mu.Lock() + defer mu.Unlock() + + if maxInFlt < 2 { + t.Errorf("max in-flight=%d want >= 2 (readahead concurrent)", maxInFlt) + } +} + +// TestHandleGet_Readahead_CancellationClosesBodies verifies that +// when the streaming consumer aborts mid-response (e.g. a downstream +// write fails), every prefetched body still buffered in the +// readahead channel is Close()d on the way out. Without this the +// cachestore would leak HTTP response bodies whenever a client +// disconnects partway through a large blob. +// +// Setup: the handler streams to an http.ResponseWriter wrapped to +// return an io.ErrShortWrite after a fixed byte count, forcing the +// streamSlice call to abort mid-chunk. We then assert that every +// trackedReadCloser handed out has had Close() called. +func TestHandleGet_Readahead_CancellationClosesBodies(t *testing.T) { + t.Parallel() + + const ( + chunkSize = int64(256) + nChunks = int64(8) + objectSize = chunkSize * nChunks + readahead = 4 + ) + + info := origin.ObjectInfo{Size: objectSize, ETag: "e", ContentType: "application/octet-stream"} + + var ( + mu sync.Mutex + bodies []*trackedReadCloser + ) + + fc := &fakeEdgeAPI{ + HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) { + return info, nil + }, + GetChunkFunc: func(_ context.Context, k chunk.Key, _ int64) (io.ReadCloser, error) { + b := &trackedReadCloser{ + Reader: bytes.NewReader(makeChunkData(k.Index, int(chunkSize))), + closedCh: make(chan struct{}), + } + + mu.Lock() + + bodies = append(bodies, b) + mu.Unlock() + + return b, nil + }, + } + + cfg := readaheadConfig(chunkSize, readahead) + h := NewEdgeHandler(fc, cfg, discardLogger()) + + // shortWriter writes the first maxBytes bytes to inner and + // returns io.ErrShortWrite on any further write. Reproduces a + // client connection that closes mid-stream. + rr := httptest.NewRecorder() + w := &shortWriter{inner: rr, maxBytes: int(chunkSize) + int(chunkSize)/2} // 1.5 chunks + + req := httptest.NewRequest(http.MethodGet, "/bucket/key", nil) + h.handleGet(w, req, "bucket", "key") + + // All bodies handed out should be closed; allow a brief window + // for the producer goroutine to observe ctx-cancellation and + // close its in-flight body via the select branch. + deadline := time.After(2 * time.Second) + + for i := 0; ; i++ { + mu.Lock() + allClosed := true + + for _, b := range bodies { + if !b.closed { + allClosed = false + break + } + } + + count := len(bodies) + mu.Unlock() + + if allClosed && count > 1 { + // Multiple bodies were handed out and all are closed. + return + } + + select { + case <-deadline: + mu.Lock() + defer mu.Unlock() + + if count <= 1 { + t.Fatalf("only %d bodies handed out; readahead did not engage", count) + } + + for j, b := range bodies { + if !b.closed { + t.Errorf("body[%d] (chunk index %d) not closed", j, j) + } + } + + return + default: + time.Sleep(10 * time.Millisecond) + } + + _ = i + } +} + +// TestHandleGet_Readahead_ProducerPanicRecovered verifies that a +// panic inside the readahead producer goroutine is recovered, logged, +// and does not deadlock the consumer or crash the process. The +// consumer should see an early channel close and treat the response +// as a mid-stream abort. +func TestHandleGet_Readahead_ProducerPanicRecovered(t *testing.T) { + t.Parallel() + + const ( + chunkSize = int64(256) + nChunks = int64(6) + objectSize = chunkSize * nChunks + readahead = 2 + ) + + info := origin.ObjectInfo{Size: objectSize, ETag: "e", ContentType: "application/octet-stream"} + + var ( + mu sync.Mutex + calls int64 + panicAt = int64(3) // panic on the 3rd GetChunk + ) + + fc := &fakeEdgeAPI{ + HeadObjectFunc: func(_ context.Context, _, _ string) (origin.ObjectInfo, error) { + return info, nil + }, + GetChunkFunc: func(_ context.Context, k chunk.Key, _ int64) (io.ReadCloser, error) { + mu.Lock() + calls++ + n := calls + mu.Unlock() + + if n == panicAt { + panic("readahead test: synthetic producer panic") + } + + return io.NopCloser(bytes.NewReader(makeChunkData(k.Index, int(chunkSize)))), nil + }, + } + + var logBuf bytes.Buffer + + cfg := readaheadConfig(chunkSize, readahead) + h := NewEdgeHandler(fc, cfg, debugLoggerTo(&logBuf)) + + req := httptest.NewRequest(http.MethodGet, "/bucket/key", nil) + rr := httptest.NewRecorder() + + done := make(chan struct{}) + + go func() { + defer close(done) + + h.handleGet(rr, req, "bucket", "key") + }() + + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatalf("handler deadlocked after producer panic") + } + + // The first chunk was peeked and streamed successfully (a + // committed 200 response). Subsequent panic is a mid-stream + // abort; the response code is therefore 200 even though the + // body is truncated. + if rr.Code != http.StatusOK { + t.Errorf("status=%d want 200 (panic is mid-stream)", rr.Code) + } + + out := logBuf.String() + if !strings.Contains(out, "readahead worker panic") { + t.Errorf("missing 'readahead worker panic' in log; got %q", out) + } +} + +// shortWriter writes the first maxBytes bytes to inner then returns +// io.ErrShortWrite on any subsequent Write. Used to simulate a +// client connection that drops mid-response. +type shortWriter struct { + inner http.ResponseWriter + written int + maxBytes int +} + +func (s *shortWriter) Header() http.Header { return s.inner.Header() } + +func (s *shortWriter) WriteHeader(code int) { s.inner.WriteHeader(code) } + +func (s *shortWriter) Write(p []byte) (int, error) { + if s.written >= s.maxBytes { + return 0, io.ErrShortWrite + } + + remaining := s.maxBytes - s.written + if len(p) > remaining { + // Write exactly up to the cap, then fail any further calls. + n, _ := s.inner.Write(p[:remaining]) + s.written += n + + return n, io.ErrShortWrite + } + + n, err := s.inner.Write(p) + s.written += n + + return n, err +}