From 8ab6e6e719c2e9591acd3b39bd7e282663e545df Mon Sep 17 00:00:00 2001
From: prismn <prismn@users.noreply.github.com>
Date: Mon, 22 Jun 2026 16:00:45 +0100
Subject: [PATCH] feat: add storage tiering automation

---
 Backend/src/gists/dto/query-gists.dto.ts |  13 +-
 Backend/src/gists/gist.repository.ts     |  22 +-
 Backend/src/gists/gists.controller.ts    |   3 -
 Backend/src/gists/gists.service.ts       |  37 +--
 infrastructure/docs/storage-tiering.md   |  76 ++++++
 infrastructure/scripts/analyze-access.sh | 305 +++++++++++++++++++++++
 infrastructure/terraform/s3-tiering.tf   |  41 +++
 7 files changed, 438 insertions(+), 59 deletions(-)
 create mode 100644 infrastructure/docs/storage-tiering.md
 create mode 100755 infrastructure/scripts/analyze-access.sh
 create mode 100644 infrastructure/terraform/s3-tiering.tf
diff --git a/Backend/src/gists/dto/query-gists.dto.ts b/Backend/src/gists/dto/query-gists.dto.ts
index 3b4a029b..7386f12c 100644
--- a/Backend/src/gists/dto/query-gists.dto.ts
+++ b/Backend/src/gists/dto/query-gists.dto.ts
@@ -1,17 +1,16 @@
 import { ApiProperty, ApiPropertyOptional } from '@nestjs/swagger';
+import { Transform, Type } from 'class-transformer';
 import {
+  IsBoolean,
   IsLatitude,
   IsLongitude,
-  IsOptional,
   IsNumber,
-  Min,
-  Max,
+  IsOptional,
   IsString,
+  Max,
   MaxLength,
-  IsBoolean,
+  Min,
 } from 'class-validator';
-import { IsLatitude, IsLongitude, IsOptional, IsNumber, Min, Max, IsString, IsBoolean, MaxLength } from 'class-validator';
-import { Type, Transform } from 'class-transformer';
 
 export class QueryGistsDto {
   @ApiProperty({ description: 'Latitude to search from', example: 9.0579 })
@@ -63,8 +62,6 @@ export class QueryGistsDto {
   authorAddress?: string;
 
   @ApiPropertyOptional({
-    description: 'Return count grouped by location_cell for heatmap data',
-    example: true,
     description: 'When true, count endpoint returns breakdown by location_cell',
     example: false,
   })
diff --git a/Backend/src/gists/gist.repository.ts b/Backend/src/gists/gist.repository.ts
index 3a630f06..299d2d9f 100644
--- a/Backend/src/gists/gist.repository.ts
+++ b/Backend/src/gists/gist.repository.ts
@@ -12,7 +12,7 @@ export interface NearbyQuery {
   lon: number;
   radiusMeters?: number;
   limit?: number;
-  cursor?: string; // base64 encoded cursor or raw ISO date string
+  cursor?: string;
   authorAddress?: string;
 }
 
@@ -32,13 +32,6 @@ export interface CreateGistData {
 export class GistRepository {
   constructor(@InjectDataSource() private readonly dataSource: DataSource) {}
 
-  /**
-   * Persist a new gist row. When a transactional `EntityManager` is supplied
-   * (e.g. from `GistsService.create`), the INSERT joins the caller's
-   * transaction so the write can be rolled back atomically. When no manager
-   * is provided (e.g. from `IndexerService` on the connection pool), the
-   * INSERT runs in its own implicit transaction.
-   */
   async create(data: CreateGistData, manager?: EntityManager): Promise<Gist> {
     const {
       content,
@@ -52,9 +45,7 @@ export class GistRepository {
       expires_at,
     } = data;
 
-    // Default expiry: 24 hours from now
     const expiresAt = expires_at ?? new Date(Date.now() + 24 * 60 * 60 * 1000);
-
     const queryRunner = manager ?? this.dataSource;
 
     const result = await queryRunner.query<Gist[]>(
@@ -86,18 +77,15 @@ export class GistRepository {
     const params: unknown[] = [lon, lat, radiusMeters, limit];
     const clauses: string[] = [];
 
-    // Issue #604 — exclude expired gists
     clauses.push(`g.expires_at > NOW()`);
 
     if (cursor) {
-      // Support both base64 encoded cursors and raw ISO strings
       const decoded = PaginationHelper.decodeCursor(cursor) ?? cursor;
       params.push(decoded);
       clauses.push(`g.created_at < $${params.length}`);
     }
 
     if (authorAddress) {
-      // Case-sensitive exact match — Stellar addresses are encoded payloads
       params.push(authorAddress);
       clauses.push(`g.author_address = $${params.length}`);
     }
@@ -129,7 +117,7 @@ export class GistRepository {
         $3
       )
       ${extraWhere}
-      ORDER BY g.created_at DESC
+      ORDER BY distance_meters ASC, g.created_at DESC
       LIMIT $4
       `,
       params,
@@ -181,7 +169,6 @@ export class GistRepository {
     return parseInt(row.cnt, 10) > 0;
   }
 
-  /** Issue #604 — delete rows whose expiry has passed (called by cron job). */
   async deleteExpired(): Promise<number> {
     const result = await this.dataSource.query<Array<{ count: string }>>(
       `WITH deleted AS (DELETE FROM gists WHERE expires_at <= NOW() RETURNING id)
@@ -189,6 +176,7 @@ export class GistRepository {
     );
     return parseInt(result[0].count, 10);
   }
+
   async countNearby(lat: number, lon: number, radiusMeters: number): Promise<number> {
     const [row] = await this.dataSource.query<Array<{ count: string }>>(
       `SELECT COUNT(*) AS count FROM gists
@@ -206,10 +194,6 @@ export class GistRepository {
     query: Pick<NearbyQuery, 'lat' | 'lon' | 'radiusMeters'>,
   ): Promise<Array<{ cell: string; count: number }>> {
     const { lat, lon, radiusMeters = 500 } = query;
-    lat: number,
-    lon: number,
-    radiusMeters: number,
-  ): Promise<Array<{ cell: string; count: number }>> {
     const rows = await this.dataSource.query<Array<{ location_cell: string; count: string }>>(
       `SELECT location_cell, COUNT(*) AS count FROM gists
        WHERE ST_DWithin(
diff --git a/Backend/src/gists/gists.controller.ts b/Backend/src/gists/gists.controller.ts
index 535bcb7a..554e5a47 100644
--- a/Backend/src/gists/gists.controller.ts
+++ b/Backend/src/gists/gists.controller.ts
@@ -37,9 +37,6 @@ export class GistsController {
   @Get('count')
   @SkipThrottle()
   @ApiOperation({ summary: 'Count gists near a location (optionally broken down by cell)' })
-  @Get('count')
-  @SkipThrottle()
-  @ApiOperation({ summary: 'Count active gists within a radius' })
   countNearby(@Query() query: QueryGistsDto) {
     return this.gistsService.countNearby(query);
   }
diff --git a/Backend/src/gists/gists.service.ts b/Backend/src/gists/gists.service.ts
index ee9ebd24..763659b3 100644
--- a/Backend/src/gists/gists.service.ts
+++ b/Backend/src/gists/gists.service.ts
@@ -13,6 +13,14 @@ import { stripHtml } from '../common/utils/sanitize';
 
 const DEFAULT_TTL_HOURS = 24;
 
+export interface CountNearbyResult {
+  count: number;
+  radius: number;
+  lat: number;
+  lon: number;
+  breakdown?: Array<{ cell: string; count: number }>;
+}
+
 @Injectable()
 export class GistsService {
   private readonly logger = new Logger(GistsService.name);
@@ -25,24 +33,8 @@ export class GistsService {
     private readonly sorobanService: SorobanService,
   ) {}
 
-  /**
-   * Create a gist end-to-end.
-   *
-   * Issue #98 — atomicity:
-   *   - External side-effects (sanitize, geo-encode, IPFS pin, Soroban post)
-   *     happen OUTSIDE the database transaction because they cannot be
-   *     rolled back from Postgres and would just block a connection slot.
-   *   - The actual database INSERT runs inside `dataSource.transaction()` so
-   *     any error thrown during the write rolls back the row atomically and
-   *     future related writes (audit log, related tables) join the same tx.
-   *   - A duplicate `stellar_gist_id` (e.g. retried Soroban post) raises a
-   *     Postgres unique-violation (SQLSTATE 23505); we catch it and return
-   *     the existing row so the API becomes safely idempotent.
-   */
   async create(dto: CreateGistDto): Promise<Gist> {
-    // Issue 87 — sanitize content before storing
     const content = stripHtml(dto.content);
-
     const locationCell = this.geoService.encode(dto.lat, dto.lon);
 
     const { cid } = await this.ipfsService.pinJson({
@@ -57,7 +49,6 @@ export class GistsService {
 
     this.logger.log(`Gist posted → cell=${locationCell} cid=${cid} gistId=${gistId}`);
 
-    // Issue #604 — compute expiry from ttlHours (default 24 h)
     const ttlHours = dto.ttlHours ?? DEFAULT_TTL_HOURS;
     const expiresAt = new Date(Date.now() + ttlHours * 60 * 60 * 1000);
 
@@ -81,9 +72,6 @@ export class GistsService {
     } catch (err) {
       const code = (err as { code?: string })?.code;
       if (code === PG_UNIQUE_VIOLATION) {
-        // Concurrent or retried create with the same on-chain gist ID — the
-        // winning transaction already persisted the row. Return it so the
-        // caller observes a logically idempotent success.
         this.logger.debug(
           `Gist ${gistId} already indexed — returning existing row (SQLSTATE ${PG_UNIQUE_VIOLATION})`,
         );
@@ -122,16 +110,7 @@ export class GistsService {
       return { count: total, radius, lat, lon, breakdown: rows };
     }
 
-    const count = await this.gistRepository.countNearby({ lat, lon, radiusMeters: radius });
-  async countNearby(
-    query: QueryGistsDto,
-  ): Promise<{ count: number; radius: number; lat: number; lon: number; breakdown?: Array<{ cell: string; count: number }> }> {
-    const { lat, lon, radius = 500, breakdown } = query;
     const count = await this.gistRepository.countNearby(lat, lon, radius);
-    if (breakdown) {
-      const cells = await this.gistRepository.countNearbyByCell(lat, lon, radius);
-      return { count, radius, lat, lon, breakdown: cells };
-    }
     return { count, radius, lat, lon };
   }
 }
diff --git a/infrastructure/docs/storage-tiering.md b/infrastructure/docs/storage-tiering.md
new file mode 100644
index 00000000..f8a8ea0f
--- /dev/null
+++ b/infrastructure/docs/storage-tiering.md
@@ -0,0 +1,76 @@
+# Storage Tiering Automation
+
+This repository now has a small storage-tiering workflow for long-lived S3 data:
+
+- Terraform lifecycle rules for the backup bucket
+- An access-analysis script that scores objects by recency and request volume
+- Retrieval commands for archived objects
+- A JSON report that can be used for cost review
+
+## Terraform
+
+The backup bucket is configured to transition through cheaper tiers over time:
+
+- `STANDARD_IA` after 30 days
+- `GLACIER_IR` after 90 days
+- `DEEP_ARCHIVE` after 180 days
+- noncurrent versions are expired after 365 days
+
+See [`infrastructure/terraform/s3-tiering.tf`](../terraform/s3-tiering.tf) and the existing bucket definitions in [`infrastructure/terraform/s3-buckets.tf`](../terraform/s3-buckets.tf).
+
+## Access Analysis
+
+Use [`infrastructure/scripts/analyze-access.sh`](../scripts/analyze-access.sh) to turn an access inventory into a recommendation report.
+
+The script expects JSON like:
+
+```json
+[
+  {
+    "key": "backups/2026-06-01.sql.gz",
+    "bucket": "gistpin-backups",
+    "storage_class": "STANDARD",
+    "size_bytes": 12345,
+    "last_accessed_days": 42,
+    "request_count_30d": 8
+  }
+]
+```
+
+Run it with a file or by piping JSON on stdin:
+
+```bash
+bash infrastructure/scripts/analyze-access.sh \
+  --input /tmp/s3-access-inventory.json \
+  --bucket gistpin-backups
+```
+
+The script writes a report to `infrastructure/ci/reports/storage-tiering/` by default. The report includes:
+
+- object counts per tier
+- normalized storage-cost estimates
+- objects that should move to a cheaper class
+- restore commands for archived objects
+- a lifecycle recommendation block for the backup bucket
+
+## Retrieval
+
+For objects recommended for archive storage, the report includes restore commands using `aws s3api restore-object`.
+
+Example:
+
+```bash
+aws s3api restore-object \
+  --bucket gistpin-backups \
+  --key backups/2026-06-01.sql.gz \
+  --restore-request '{"Days":7,"GlacierJobParameters":{"Tier":"Standard"}}'
+```
+
+## Cost Review
+
+Treat the generated report as an operational cost review artifact:
+
+1. Run the access analysis on the latest inventory export.
+2. Review objects recommended for tier changes.
+3. Apply the lifecycle rules or restore commands as needed.
+4. Re-run the report to confirm the savings trend.
diff --git a/infrastructure/scripts/analyze-access.sh b/infrastructure/scripts/analyze-access.sh
new file mode 100755
index 00000000..dd9f57a1
--- /dev/null
+++ b/infrastructure/scripts/analyze-access.sh
@@ -0,0 +1,305 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+cd "${REPO_ROOT}"
+
+INPUT_FILE=""
+OUTPUT_FILE="${OUTPUT_FILE:-}"
+BUCKET_NAME="${BUCKET_NAME:-gistpin-backups}"
+RESTORE_KEY="${RESTORE_KEY:-}"
+REPORT_DIR="${REPORT_DIR:-infrastructure/ci/reports/storage-tiering}"
+
+usage() {
+  cat <<'EOF'
+Usage: analyze-access.sh [--input FILE] [--output FILE] [--bucket NAME] [--restore-key KEY]
+
+Reads a JSON inventory of S3 objects and produces a storage-tiering report.
+If no input is provided, the script writes an empty report and exits cleanly.
+
+Expected JSON shape:
+[
+  {
+    "key": "backups/2026-06-01.sql.gz",
+    "bucket": "gistpin-backups",
+    "storage_class": "STANDARD",
+    "size_bytes": 12345,
+    "last_accessed_days": 42,
+    "request_count_30d": 8
+  }
+]
+EOF
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --input)
+      INPUT_FILE="${2:-}"
+      shift 2
+      ;;
+    --output)
+      OUTPUT_FILE="${2:-}"
+      shift 2
+      ;;
+    --bucket)
+      BUCKET_NAME="${2:-}"
+      shift 2
+      ;;
+    --restore-key)
+      RESTORE_KEY="${2:-}"
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      usage >&2
+      exit 1
+      ;;
+  esac
+done
+
+if [[ -z "${OUTPUT_FILE}" ]]; then
+  TIMESTAMP="$(date -u +%Y%m%dT%H%M%SZ)"
+  OUTPUT_FILE="${REPORT_DIR}/storage-tiering-${TIMESTAMP}.json"
+fi
+
+mkdir -p "$(dirname "${OUTPUT_FILE}")"
+
+if ! command -v python3 >/dev/null 2>&1; then
+  echo "python3 is required to run access analysis." >&2
+  exit 1
+fi
+
+python3 - "${INPUT_FILE}" "${OUTPUT_FILE}" "${BUCKET_NAME}" "${RESTORE_KEY}" <<'PY'
+from __future__ import annotations
+
+import json
+import pathlib
+import shlex
+import sys
+from datetime import datetime, timezone
+
+input_file, output_file, default_bucket, restore_key = sys.argv[1:5]
+
+WEIGHTS = {
+    "STANDARD": 1.0,
+    "STANDARD_IA": 0.7,
+    "GLACIER_IR": 0.25,
+    "DEEP_ARCHIVE": 0.05,
+}
+
+DEFAULT_LIFECYCLE_RECOMMENDATION = [
+    {"days": 30, "storage_class": "STANDARD_IA"},
+    {"days": 90, "storage_class": "GLACIER_IR"},
+    {"days": 180, "storage_class": "DEEP_ARCHIVE"},
+]
+
+
+def to_int(value: object, default: int = 0) -> int:
+    if value in (None, ""):
+        return default
+    try:
+        return int(float(value))
+    except (TypeError, ValueError):
+        return default
+
+
+def first_present(*values: object, default: object = "") -> object:
+    for value in values:
+        if value not in (None, ""):
+            return value
+    return default
+
+
+def classify(days: int, requests: int) -> tuple[str, str, str]:
+    if requests >= 100 or days <= 7:
+        return "hot", "STANDARD", "frequently accessed"
+    if days <= 30 or requests >= 20:
+        return "warm", "STANDARD_IA", "moderate access frequency"
+    if days <= 90 or requests >= 1:
+        return "cold", "GLACIER_IR", "infrequent access"
+    return "archive", "DEEP_ARCHIVE", "cold archive candidate"
+
+
+def weight(storage_class: str) -> float:
+    return WEIGHTS.get(storage_class.upper(), 1.0)
+
+
+def load_inventory() -> list[dict[str, object]]:
+    raw = ""
+    input_path = pathlib.Path(input_file) if input_file else None
+
+    if input_path and input_path.exists():
+        raw = input_path.read_text()
+    elif not sys.stdin.isatty():
+        raw = sys.stdin.read()
+
+    if not raw.strip():
+        return []
+
+    payload = json.loads(raw)
+    if isinstance(payload, list):
+        return [item for item in payload if isinstance(item, dict)]
+    if isinstance(payload, dict):
+        for key in ("objects", "items", "records", "data"):
+            value = payload.get(key)
+            if isinstance(value, list):
+                return [item for item in value if isinstance(item, dict)]
+    return []
+
+
+def normalize(inventory: list[dict[str, object]]) -> list[dict[str, object]]:
+    normalized: list[dict[str, object]] = []
+    for entry in inventory:
+        key = str(
+            first_present(
+                entry.get("key"),
+                entry.get("Key"),
+                entry.get("object_key"),
+                entry.get("name"),
+                default="",
+            )
+        )
+        if not key:
+            continue
+
+        bucket = str(
+            first_present(entry.get("bucket"), entry.get("Bucket"), default=default_bucket)
+        )
+        storage_class = str(first_present(entry.get("storage_class"), entry.get("StorageClass"), default="STANDARD")).upper()
+        size_bytes = to_int(first_present(entry.get("size_bytes"), entry.get("Size"), entry.get("size"), default=0))
+        last_accessed_days = to_int(
+            first_present(
+                entry.get("last_accessed_days"),
+                entry.get("days_since_last_access"),
+                entry.get("age_days"),
+                default=0,
+            )
+        )
+        request_count_30d = to_int(
+            first_present(
+                entry.get("request_count_30d"),
+                entry.get("requests_30d"),
+                entry.get("access_count_30d"),
+                default=0,
+            )
+        )
+
+        tier, recommended_storage_class, reason = classify(last_accessed_days, request_count_30d)
+        current_weight = weight(storage_class)
+        recommended_weight = weight(recommended_storage_class)
+
+        restore_command = None
+        if recommended_storage_class in {"GLACIER_IR", "DEEP_ARCHIVE"}:
+            restore_request = json.dumps(
+                {"Days": 7, "GlacierJobParameters": {"Tier": "Standard"}},
+                separators=(",", ":"),
+            )
+            restore_command = (
+                f"aws s3api restore-object --bucket {shlex.quote(bucket)} "
+                f"--key {shlex.quote(key)} --restore-request {shlex.quote(restore_request)}"
+            )
+
+        normalized.append(
+            {
+                "bucket": bucket,
+                "key": key,
+                "storage_class": storage_class,
+                "size_bytes": size_bytes,
+                "last_accessed_days": last_accessed_days,
+                "request_count_30d": request_count_30d,
+                "tier": tier,
+                "recommended_storage_class": recommended_storage_class,
+                "reason": reason,
+                "storage_class_changed": storage_class != recommended_storage_class,
+                "current_relative_cost_units": round(size_bytes * current_weight, 2),
+                "optimized_relative_cost_units": round(size_bytes * recommended_weight, 2),
+                "restore_command": restore_command,
+            }
+        )
+    return normalized
+
+
+def summarize(objects: list[dict[str, object]]) -> dict[str, object]:
+    summary = {
+        "objects": len(objects),
+        "bytes": 0,
+        "current_relative_cost_units": 0.0,
+        "optimized_relative_cost_units": 0.0,
+    }
+    tier_breakdown = {
+        "hot": {"objects": 0, "bytes": 0},
+        "warm": {"objects": 0, "bytes": 0},
+        "cold": {"objects": 0, "bytes": 0},
+        "archive": {"objects": 0, "bytes": 0},
+    }
+    recommendations = []
+    restore_candidates = []
+
+    for obj in objects:
+        summary["bytes"] += obj["size_bytes"]
+        summary["current_relative_cost_units"] += obj["current_relative_cost_units"]
+        summary["optimized_relative_cost_units"] += obj["optimized_relative_cost_units"]
+
+        tier_breakdown[obj["tier"]]["objects"] += 1
+        tier_breakdown[obj["tier"]]["bytes"] += obj["size_bytes"]
+
+        if obj["storage_class_changed"]:
+            recommendations.append(
+                {
+                    "bucket": obj["bucket"],
+                    "key": obj["key"],
+                    "current_storage_class": obj["storage_class"],
+                    "recommended_storage_class": obj["recommended_storage_class"],
+                    "reason": obj["reason"],
+                }
+            )
+
+        if obj["restore_command"]:
+            restore_candidates.append(
+                {
+                    "bucket": obj["bucket"],
+                    "key": obj["key"],
+                    "recommended_storage_class": obj["recommended_storage_class"],
+                    "restore_command": obj["restore_command"],
+                }
+            )
+
+    savings = 0.0
+    if summary["current_relative_cost_units"] > 0:
+        savings = round(
+            (1 - summary["optimized_relative_cost_units"] / summary["current_relative_cost_units"]) * 100,
+            2,
+        )
+
+    lifecycle_recommendation = DEFAULT_LIFECYCLE_RECOMMENDATION
+    if restore_key:
+        restore_candidates = [item for item in restore_candidates if item["key"] == restore_key]
+
+    return {
+        "generated_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
+        "source": input_file or "stdin",
+        "bucket": default_bucket,
+        "summary": {
+            **summary,
+            "estimated_savings_percent": savings,
+        },
+        "tier_breakdown": tier_breakdown,
+        "recommendations": recommendations,
+        "restore_candidates": restore_candidates,
+        "lifecycle_recommendation": lifecycle_recommendation,
+    }
+
+
+report = summarize(normalize(load_inventory()))
+pathlib.Path(output_file).write_text(json.dumps(report, indent=2, sort_keys=True) + "\n")
+print(json.dumps({
+    "report_file": output_file,
+    "objects": report["summary"]["objects"],
+    "estimated_savings_percent": report["summary"]["estimated_savings_percent"],
+    "restore_candidates": len(report["restore_candidates"]),
+}, sort_keys=True))
+PY
diff --git a/infrastructure/terraform/s3-tiering.tf b/infrastructure/terraform/s3-tiering.tf
new file mode 100644
index 00000000..69ed5d2a
--- /dev/null
+++ b/infrastructure/terraform/s3-tiering.tf
@@ -0,0 +1,41 @@
+resource "aws_s3_bucket_lifecycle_configuration" "backups" {
+  bucket = aws_s3_bucket.backups.id
+
+  rule {
+    id     = "tier-backups-by-age"
+    status = "Enabled"
+
+    transition {
+      days          = 30
+      storage_class = "STANDARD_IA"
+    }
+
+    transition {
+      days          = 90
+      storage_class = "GLACIER_IR"
+    }
+
+    transition {
+      days          = 180
+      storage_class = "DEEP_ARCHIVE"
+    }
+
+    noncurrent_version_transition {
+      noncurrent_days = 30
+      storage_class   = "STANDARD_IA"
+    }
+
+    noncurrent_version_transition {
+      noncurrent_days = 90
+      storage_class   = "GLACIER_IR"
+    }
+
+    noncurrent_version_expiration {
+      noncurrent_days = 365
+    }
+
+    expiration {
+      days = 365
+    }
+  }
+}