From 8ab6e6e719c2e9591acd3b39bd7e282663e545df Mon Sep 17 00:00:00 2001 From: prismn Date: Mon, 22 Jun 2026 16:00:45 +0100 Subject: [PATCH] feat: add storage tiering automation --- Backend/src/gists/dto/query-gists.dto.ts | 13 +- Backend/src/gists/gist.repository.ts | 22 +- Backend/src/gists/gists.controller.ts | 3 - Backend/src/gists/gists.service.ts | 37 +-- infrastructure/docs/storage-tiering.md | 76 ++++++ infrastructure/scripts/analyze-access.sh | 305 +++++++++++++++++++++++ infrastructure/terraform/s3-tiering.tf | 41 +++ 7 files changed, 438 insertions(+), 59 deletions(-) create mode 100644 infrastructure/docs/storage-tiering.md create mode 100755 infrastructure/scripts/analyze-access.sh create mode 100644 infrastructure/terraform/s3-tiering.tf diff --git a/Backend/src/gists/dto/query-gists.dto.ts b/Backend/src/gists/dto/query-gists.dto.ts index 3b4a029b..7386f12c 100644 --- a/Backend/src/gists/dto/query-gists.dto.ts +++ b/Backend/src/gists/dto/query-gists.dto.ts @@ -1,17 +1,16 @@ import { ApiProperty, ApiPropertyOptional } from '@nestjs/swagger'; +import { Transform, Type } from 'class-transformer'; import { + IsBoolean, IsLatitude, IsLongitude, - IsOptional, IsNumber, - Min, - Max, + IsOptional, IsString, + Max, MaxLength, - IsBoolean, + Min, } from 'class-validator'; -import { IsLatitude, IsLongitude, IsOptional, IsNumber, Min, Max, IsString, IsBoolean, MaxLength } from 'class-validator'; -import { Type, Transform } from 'class-transformer'; export class QueryGistsDto { @ApiProperty({ description: 'Latitude to search from', example: 9.0579 }) @@ -63,8 +62,6 @@ export class QueryGistsDto { authorAddress?: string; @ApiPropertyOptional({ - description: 'Return count grouped by location_cell for heatmap data', - example: true, description: 'When true, count endpoint returns breakdown by location_cell', example: false, }) diff --git a/Backend/src/gists/gist.repository.ts b/Backend/src/gists/gist.repository.ts index 3a630f06..299d2d9f 100644 --- a/Backend/src/gists/gist.repository.ts +++ b/Backend/src/gists/gist.repository.ts @@ -12,7 +12,7 @@ export interface NearbyQuery { lon: number; radiusMeters?: number; limit?: number; - cursor?: string; // base64 encoded cursor or raw ISO date string + cursor?: string; authorAddress?: string; } @@ -32,13 +32,6 @@ export interface CreateGistData { export class GistRepository { constructor(@InjectDataSource() private readonly dataSource: DataSource) {} - /** - * Persist a new gist row. When a transactional `EntityManager` is supplied - * (e.g. from `GistsService.create`), the INSERT joins the caller's - * transaction so the write can be rolled back atomically. When no manager - * is provided (e.g. from `IndexerService` on the connection pool), the - * INSERT runs in its own implicit transaction. - */ async create(data: CreateGistData, manager?: EntityManager): Promise { const { content, @@ -52,9 +45,7 @@ export class GistRepository { expires_at, } = data; - // Default expiry: 24 hours from now const expiresAt = expires_at ?? new Date(Date.now() + 24 * 60 * 60 * 1000); - const queryRunner = manager ?? this.dataSource; const result = await queryRunner.query( @@ -86,18 +77,15 @@ export class GistRepository { const params: unknown[] = [lon, lat, radiusMeters, limit]; const clauses: string[] = []; - // Issue #604 — exclude expired gists clauses.push(`g.expires_at > NOW()`); if (cursor) { - // Support both base64 encoded cursors and raw ISO strings const decoded = PaginationHelper.decodeCursor(cursor) ?? cursor; params.push(decoded); clauses.push(`g.created_at < $${params.length}`); } if (authorAddress) { - // Case-sensitive exact match — Stellar addresses are encoded payloads params.push(authorAddress); clauses.push(`g.author_address = $${params.length}`); } @@ -129,7 +117,7 @@ export class GistRepository { $3 ) ${extraWhere} - ORDER BY g.created_at DESC + ORDER BY distance_meters ASC, g.created_at DESC LIMIT $4 `, params, @@ -181,7 +169,6 @@ export class GistRepository { return parseInt(row.cnt, 10) > 0; } - /** Issue #604 — delete rows whose expiry has passed (called by cron job). */ async deleteExpired(): Promise { const result = await this.dataSource.query>( `WITH deleted AS (DELETE FROM gists WHERE expires_at <= NOW() RETURNING id) @@ -189,6 +176,7 @@ export class GistRepository { ); return parseInt(result[0].count, 10); } + async countNearby(lat: number, lon: number, radiusMeters: number): Promise { const [row] = await this.dataSource.query>( `SELECT COUNT(*) AS count FROM gists @@ -206,10 +194,6 @@ export class GistRepository { query: Pick, ): Promise> { const { lat, lon, radiusMeters = 500 } = query; - lat: number, - lon: number, - radiusMeters: number, - ): Promise> { const rows = await this.dataSource.query>( `SELECT location_cell, COUNT(*) AS count FROM gists WHERE ST_DWithin( diff --git a/Backend/src/gists/gists.controller.ts b/Backend/src/gists/gists.controller.ts index 535bcb7a..554e5a47 100644 --- a/Backend/src/gists/gists.controller.ts +++ b/Backend/src/gists/gists.controller.ts @@ -37,9 +37,6 @@ export class GistsController { @Get('count') @SkipThrottle() @ApiOperation({ summary: 'Count gists near a location (optionally broken down by cell)' }) - @Get('count') - @SkipThrottle() - @ApiOperation({ summary: 'Count active gists within a radius' }) countNearby(@Query() query: QueryGistsDto) { return this.gistsService.countNearby(query); } diff --git a/Backend/src/gists/gists.service.ts b/Backend/src/gists/gists.service.ts index ee9ebd24..763659b3 100644 --- a/Backend/src/gists/gists.service.ts +++ b/Backend/src/gists/gists.service.ts @@ -13,6 +13,14 @@ import { stripHtml } from '../common/utils/sanitize'; const DEFAULT_TTL_HOURS = 24; +export interface CountNearbyResult { + count: number; + radius: number; + lat: number; + lon: number; + breakdown?: Array<{ cell: string; count: number }>; +} + @Injectable() export class GistsService { private readonly logger = new Logger(GistsService.name); @@ -25,24 +33,8 @@ export class GistsService { private readonly sorobanService: SorobanService, ) {} - /** - * Create a gist end-to-end. - * - * Issue #98 — atomicity: - * - External side-effects (sanitize, geo-encode, IPFS pin, Soroban post) - * happen OUTSIDE the database transaction because they cannot be - * rolled back from Postgres and would just block a connection slot. - * - The actual database INSERT runs inside `dataSource.transaction()` so - * any error thrown during the write rolls back the row atomically and - * future related writes (audit log, related tables) join the same tx. - * - A duplicate `stellar_gist_id` (e.g. retried Soroban post) raises a - * Postgres unique-violation (SQLSTATE 23505); we catch it and return - * the existing row so the API becomes safely idempotent. - */ async create(dto: CreateGistDto): Promise { - // Issue 87 — sanitize content before storing const content = stripHtml(dto.content); - const locationCell = this.geoService.encode(dto.lat, dto.lon); const { cid } = await this.ipfsService.pinJson({ @@ -57,7 +49,6 @@ export class GistsService { this.logger.log(`Gist posted → cell=${locationCell} cid=${cid} gistId=${gistId}`); - // Issue #604 — compute expiry from ttlHours (default 24 h) const ttlHours = dto.ttlHours ?? DEFAULT_TTL_HOURS; const expiresAt = new Date(Date.now() + ttlHours * 60 * 60 * 1000); @@ -81,9 +72,6 @@ export class GistsService { } catch (err) { const code = (err as { code?: string })?.code; if (code === PG_UNIQUE_VIOLATION) { - // Concurrent or retried create with the same on-chain gist ID — the - // winning transaction already persisted the row. Return it so the - // caller observes a logically idempotent success. this.logger.debug( `Gist ${gistId} already indexed — returning existing row (SQLSTATE ${PG_UNIQUE_VIOLATION})`, ); @@ -122,16 +110,7 @@ export class GistsService { return { count: total, radius, lat, lon, breakdown: rows }; } - const count = await this.gistRepository.countNearby({ lat, lon, radiusMeters: radius }); - async countNearby( - query: QueryGistsDto, - ): Promise<{ count: number; radius: number; lat: number; lon: number; breakdown?: Array<{ cell: string; count: number }> }> { - const { lat, lon, radius = 500, breakdown } = query; const count = await this.gistRepository.countNearby(lat, lon, radius); - if (breakdown) { - const cells = await this.gistRepository.countNearbyByCell(lat, lon, radius); - return { count, radius, lat, lon, breakdown: cells }; - } return { count, radius, lat, lon }; } } diff --git a/infrastructure/docs/storage-tiering.md b/infrastructure/docs/storage-tiering.md new file mode 100644 index 00000000..f8a8ea0f --- /dev/null +++ b/infrastructure/docs/storage-tiering.md @@ -0,0 +1,76 @@ +# Storage Tiering Automation + +This repository now has a small storage-tiering workflow for long-lived S3 data: + +- Terraform lifecycle rules for the backup bucket +- An access-analysis script that scores objects by recency and request volume +- Retrieval commands for archived objects +- A JSON report that can be used for cost review + +## Terraform + +The backup bucket is configured to transition through cheaper tiers over time: + +- `STANDARD_IA` after 30 days +- `GLACIER_IR` after 90 days +- `DEEP_ARCHIVE` after 180 days +- noncurrent versions are expired after 365 days + +See [`infrastructure/terraform/s3-tiering.tf`](../terraform/s3-tiering.tf) and the existing bucket definitions in [`infrastructure/terraform/s3-buckets.tf`](../terraform/s3-buckets.tf). + +## Access Analysis + +Use [`infrastructure/scripts/analyze-access.sh`](../scripts/analyze-access.sh) to turn an access inventory into a recommendation report. + +The script expects JSON like: + +```json +[ + { + "key": "backups/2026-06-01.sql.gz", + "bucket": "gistpin-backups", + "storage_class": "STANDARD", + "size_bytes": 12345, + "last_accessed_days": 42, + "request_count_30d": 8 + } +] +``` + +Run it with a file or by piping JSON on stdin: + +```bash +bash infrastructure/scripts/analyze-access.sh \ + --input /tmp/s3-access-inventory.json \ + --bucket gistpin-backups +``` + +The script writes a report to `infrastructure/ci/reports/storage-tiering/` by default. The report includes: + +- object counts per tier +- normalized storage-cost estimates +- objects that should move to a cheaper class +- restore commands for archived objects +- a lifecycle recommendation block for the backup bucket + +## Retrieval + +For objects recommended for archive storage, the report includes restore commands using `aws s3api restore-object`. + +Example: + +```bash +aws s3api restore-object \ + --bucket gistpin-backups \ + --key backups/2026-06-01.sql.gz \ + --restore-request '{"Days":7,"GlacierJobParameters":{"Tier":"Standard"}}' +``` + +## Cost Review + +Treat the generated report as an operational cost review artifact: + +1. Run the access analysis on the latest inventory export. +2. Review objects recommended for tier changes. +3. Apply the lifecycle rules or restore commands as needed. +4. Re-run the report to confirm the savings trend. diff --git a/infrastructure/scripts/analyze-access.sh b/infrastructure/scripts/analyze-access.sh new file mode 100755 index 00000000..dd9f57a1 --- /dev/null +++ b/infrastructure/scripts/analyze-access.sh @@ -0,0 +1,305 @@ +#!/usr/bin/env bash +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +cd "${REPO_ROOT}" + +INPUT_FILE="" +OUTPUT_FILE="${OUTPUT_FILE:-}" +BUCKET_NAME="${BUCKET_NAME:-gistpin-backups}" +RESTORE_KEY="${RESTORE_KEY:-}" +REPORT_DIR="${REPORT_DIR:-infrastructure/ci/reports/storage-tiering}" + +usage() { + cat <<'EOF' +Usage: analyze-access.sh [--input FILE] [--output FILE] [--bucket NAME] [--restore-key KEY] + +Reads a JSON inventory of S3 objects and produces a storage-tiering report. +If no input is provided, the script writes an empty report and exits cleanly. + +Expected JSON shape: +[ + { + "key": "backups/2026-06-01.sql.gz", + "bucket": "gistpin-backups", + "storage_class": "STANDARD", + "size_bytes": 12345, + "last_accessed_days": 42, + "request_count_30d": 8 + } +] +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --input) + INPUT_FILE="${2:-}" + shift 2 + ;; + --output) + OUTPUT_FILE="${2:-}" + shift 2 + ;; + --bucket) + BUCKET_NAME="${2:-}" + shift 2 + ;; + --restore-key) + RESTORE_KEY="${2:-}" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + usage >&2 + exit 1 + ;; + esac +done + +if [[ -z "${OUTPUT_FILE}" ]]; then + TIMESTAMP="$(date -u +%Y%m%dT%H%M%SZ)" + OUTPUT_FILE="${REPORT_DIR}/storage-tiering-${TIMESTAMP}.json" +fi + +mkdir -p "$(dirname "${OUTPUT_FILE}")" + +if ! command -v python3 >/dev/null 2>&1; then + echo "python3 is required to run access analysis." >&2 + exit 1 +fi + +python3 - "${INPUT_FILE}" "${OUTPUT_FILE}" "${BUCKET_NAME}" "${RESTORE_KEY}" <<'PY' +from __future__ import annotations + +import json +import pathlib +import shlex +import sys +from datetime import datetime, timezone + +input_file, output_file, default_bucket, restore_key = sys.argv[1:5] + +WEIGHTS = { + "STANDARD": 1.0, + "STANDARD_IA": 0.7, + "GLACIER_IR": 0.25, + "DEEP_ARCHIVE": 0.05, +} + +DEFAULT_LIFECYCLE_RECOMMENDATION = [ + {"days": 30, "storage_class": "STANDARD_IA"}, + {"days": 90, "storage_class": "GLACIER_IR"}, + {"days": 180, "storage_class": "DEEP_ARCHIVE"}, +] + + +def to_int(value: object, default: int = 0) -> int: + if value in (None, ""): + return default + try: + return int(float(value)) + except (TypeError, ValueError): + return default + + +def first_present(*values: object, default: object = "") -> object: + for value in values: + if value not in (None, ""): + return value + return default + + +def classify(days: int, requests: int) -> tuple[str, str, str]: + if requests >= 100 or days <= 7: + return "hot", "STANDARD", "frequently accessed" + if days <= 30 or requests >= 20: + return "warm", "STANDARD_IA", "moderate access frequency" + if days <= 90 or requests >= 1: + return "cold", "GLACIER_IR", "infrequent access" + return "archive", "DEEP_ARCHIVE", "cold archive candidate" + + +def weight(storage_class: str) -> float: + return WEIGHTS.get(storage_class.upper(), 1.0) + + +def load_inventory() -> list[dict[str, object]]: + raw = "" + input_path = pathlib.Path(input_file) if input_file else None + + if input_path and input_path.exists(): + raw = input_path.read_text() + elif not sys.stdin.isatty(): + raw = sys.stdin.read() + + if not raw.strip(): + return [] + + payload = json.loads(raw) + if isinstance(payload, list): + return [item for item in payload if isinstance(item, dict)] + if isinstance(payload, dict): + for key in ("objects", "items", "records", "data"): + value = payload.get(key) + if isinstance(value, list): + return [item for item in value if isinstance(item, dict)] + return [] + + +def normalize(inventory: list[dict[str, object]]) -> list[dict[str, object]]: + normalized: list[dict[str, object]] = [] + for entry in inventory: + key = str( + first_present( + entry.get("key"), + entry.get("Key"), + entry.get("object_key"), + entry.get("name"), + default="", + ) + ) + if not key: + continue + + bucket = str( + first_present(entry.get("bucket"), entry.get("Bucket"), default=default_bucket) + ) + storage_class = str(first_present(entry.get("storage_class"), entry.get("StorageClass"), default="STANDARD")).upper() + size_bytes = to_int(first_present(entry.get("size_bytes"), entry.get("Size"), entry.get("size"), default=0)) + last_accessed_days = to_int( + first_present( + entry.get("last_accessed_days"), + entry.get("days_since_last_access"), + entry.get("age_days"), + default=0, + ) + ) + request_count_30d = to_int( + first_present( + entry.get("request_count_30d"), + entry.get("requests_30d"), + entry.get("access_count_30d"), + default=0, + ) + ) + + tier, recommended_storage_class, reason = classify(last_accessed_days, request_count_30d) + current_weight = weight(storage_class) + recommended_weight = weight(recommended_storage_class) + + restore_command = None + if recommended_storage_class in {"GLACIER_IR", "DEEP_ARCHIVE"}: + restore_request = json.dumps( + {"Days": 7, "GlacierJobParameters": {"Tier": "Standard"}}, + separators=(",", ":"), + ) + restore_command = ( + f"aws s3api restore-object --bucket {shlex.quote(bucket)} " + f"--key {shlex.quote(key)} --restore-request {shlex.quote(restore_request)}" + ) + + normalized.append( + { + "bucket": bucket, + "key": key, + "storage_class": storage_class, + "size_bytes": size_bytes, + "last_accessed_days": last_accessed_days, + "request_count_30d": request_count_30d, + "tier": tier, + "recommended_storage_class": recommended_storage_class, + "reason": reason, + "storage_class_changed": storage_class != recommended_storage_class, + "current_relative_cost_units": round(size_bytes * current_weight, 2), + "optimized_relative_cost_units": round(size_bytes * recommended_weight, 2), + "restore_command": restore_command, + } + ) + return normalized + + +def summarize(objects: list[dict[str, object]]) -> dict[str, object]: + summary = { + "objects": len(objects), + "bytes": 0, + "current_relative_cost_units": 0.0, + "optimized_relative_cost_units": 0.0, + } + tier_breakdown = { + "hot": {"objects": 0, "bytes": 0}, + "warm": {"objects": 0, "bytes": 0}, + "cold": {"objects": 0, "bytes": 0}, + "archive": {"objects": 0, "bytes": 0}, + } + recommendations = [] + restore_candidates = [] + + for obj in objects: + summary["bytes"] += obj["size_bytes"] + summary["current_relative_cost_units"] += obj["current_relative_cost_units"] + summary["optimized_relative_cost_units"] += obj["optimized_relative_cost_units"] + + tier_breakdown[obj["tier"]]["objects"] += 1 + tier_breakdown[obj["tier"]]["bytes"] += obj["size_bytes"] + + if obj["storage_class_changed"]: + recommendations.append( + { + "bucket": obj["bucket"], + "key": obj["key"], + "current_storage_class": obj["storage_class"], + "recommended_storage_class": obj["recommended_storage_class"], + "reason": obj["reason"], + } + ) + + if obj["restore_command"]: + restore_candidates.append( + { + "bucket": obj["bucket"], + "key": obj["key"], + "recommended_storage_class": obj["recommended_storage_class"], + "restore_command": obj["restore_command"], + } + ) + + savings = 0.0 + if summary["current_relative_cost_units"] > 0: + savings = round( + (1 - summary["optimized_relative_cost_units"] / summary["current_relative_cost_units"]) * 100, + 2, + ) + + lifecycle_recommendation = DEFAULT_LIFECYCLE_RECOMMENDATION + if restore_key: + restore_candidates = [item for item in restore_candidates if item["key"] == restore_key] + + return { + "generated_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), + "source": input_file or "stdin", + "bucket": default_bucket, + "summary": { + **summary, + "estimated_savings_percent": savings, + }, + "tier_breakdown": tier_breakdown, + "recommendations": recommendations, + "restore_candidates": restore_candidates, + "lifecycle_recommendation": lifecycle_recommendation, + } + + +report = summarize(normalize(load_inventory())) +pathlib.Path(output_file).write_text(json.dumps(report, indent=2, sort_keys=True) + "\n") +print(json.dumps({ + "report_file": output_file, + "objects": report["summary"]["objects"], + "estimated_savings_percent": report["summary"]["estimated_savings_percent"], + "restore_candidates": len(report["restore_candidates"]), +}, sort_keys=True)) +PY diff --git a/infrastructure/terraform/s3-tiering.tf b/infrastructure/terraform/s3-tiering.tf new file mode 100644 index 00000000..69ed5d2a --- /dev/null +++ b/infrastructure/terraform/s3-tiering.tf @@ -0,0 +1,41 @@ +resource "aws_s3_bucket_lifecycle_configuration" "backups" { + bucket = aws_s3_bucket.backups.id + + rule { + id = "tier-backups-by-age" + status = "Enabled" + + transition { + days = 30 + storage_class = "STANDARD_IA" + } + + transition { + days = 90 + storage_class = "GLACIER_IR" + } + + transition { + days = 180 + storage_class = "DEEP_ARCHIVE" + } + + noncurrent_version_transition { + noncurrent_days = 30 + storage_class = "STANDARD_IA" + } + + noncurrent_version_transition { + noncurrent_days = 90 + storage_class = "GLACIER_IR" + } + + noncurrent_version_expiration { + noncurrent_days = 365 + } + + expiration { + days = 365 + } + } +}