Skip to content

Commit 67dd926

Browse files
committed
Merge branch 'feature/db-sync' into develop
2 parents f849b4f + 384c013 commit 67dd926

11 files changed

Lines changed: 842 additions & 143 deletions

File tree

Makefile

Lines changed: 59 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,9 @@ GBL_ADMIN_REMOTE_DIR ?= /opt/data/pgdump
5858
GBL_ADMIN_DUMP_GLOB ?= pgdump-geoportal_production-*.sql.gz
5959
GBL_ADMIN_LOCAL_DIR ?= tmp
6060
GBL_ADMIN_SQL_GLOB ?= pgdump-geoportal_production-*.sql
61+
GBL_ADMIN_RETAIN_DBS ?= 2
6162
GBL_ADMIN_IMPORT_CONFLICT ?= update
63+
GBL_ADMIN_RETIRE_MISSING ?= false
6264
GBL_ADMIN_DISTRIBUTIONS_BATCH_SIZE ?= 2000
6365
KAMAL_APP_ROLE ?= web
6466
KAMAL_PYTHON ?= /opt/venv/bin/python
@@ -415,7 +417,7 @@ gbl-admin-db-unzip: ## Decompress latest GBL Admin dump
415417
gunzip -c "$$LOCAL_GZ" > "$$LOCAL_SQL" || { echo "ERROR: gunzip failed (check disk space)."; exit 1; }; \
416418
echo "Decompressed SQL: $$LOCAL_SQL"
417419

418-
# Restore production GBL Admin dump to local ParadeDB. Uses .sql if present, otherwise streams from .gz (no extra disk).
420+
# Restore production GBL Admin dump to local ParadeDB. Uses the newest local .sql or .sql.gz dump.
419421
gbl-admin-db-restore: ## Restore GBL Admin dump to local ParadeDB
420422
@echo "Restoring production GBL Admin SQL into local ParadeDB..."
421423
@if ! command -v docker >/dev/null 2>&1; then \
@@ -427,19 +429,30 @@ gbl-admin-db-restore: ## Restore GBL Admin dump to local ParadeDB
427429
echo "Start it with: docker compose up -d paradedb"; \
428430
exit 1; \
429431
fi
430-
@LOCAL_SQL=$$(ls -1t "$(GBL_ADMIN_LOCAL_DIR)"/$(GBL_ADMIN_SQL_GLOB) 2>/dev/null | head -n 1); \
431-
LOCAL_GZ=$$(ls -1t "$(GBL_ADMIN_LOCAL_DIR)"/$(GBL_ADMIN_DUMP_GLOB) 2>/dev/null | head -n 1); \
432-
if [ -n "$$LOCAL_SQL" ]; then \
433-
SOURCE="$$LOCAL_SQL"; \
434-
DUMP_DATE=$$(basename "$$LOCAL_SQL" | sed -E 's/^pgdump-geoportal_production-([0-9]{8})\.sql$$/\1/'); \
435-
elif [ -n "$$LOCAL_GZ" ]; then \
436-
SOURCE="$$LOCAL_GZ"; \
437-
DUMP_DATE=$$(basename "$$LOCAL_GZ" | sed -E 's/^pgdump-geoportal_production-([0-9]{8})\.sql\.gz$$/\1/'); \
438-
else \
432+
@SOURCE=$$( \
433+
for file in "$(GBL_ADMIN_LOCAL_DIR)"/$(GBL_ADMIN_SQL_GLOB) "$(GBL_ADMIN_LOCAL_DIR)"/$(GBL_ADMIN_DUMP_GLOB); do \
434+
[ -f "$$file" ] || continue; \
435+
MTIME=$$(stat -f %m "$$file" 2>/dev/null || stat -c %Y "$$file" 2>/dev/null); \
436+
[ -n "$$MTIME" ] || continue; \
437+
printf "%s\t%s\n" "$$MTIME" "$$file"; \
438+
done | sort -nr | head -n 1 | cut -f2- \
439+
); \
440+
if [ -z "$$SOURCE" ]; then \
439441
echo "ERROR: No dump found in $(GBL_ADMIN_LOCAL_DIR) (need $(GBL_ADMIN_SQL_GLOB) or $(GBL_ADMIN_DUMP_GLOB))."; \
440442
echo "Run 'make gbl-admin-db-download' first."; \
441443
exit 1; \
442444
fi; \
445+
case "$$SOURCE" in \
446+
*.sql) \
447+
RESTORE_MODE="sql"; \
448+
DUMP_DATE=$$(basename "$$SOURCE" | sed -E 's/^pgdump-geoportal_production-([0-9]{8})\.sql$$/\1/') ;; \
449+
*.sql.gz) \
450+
RESTORE_MODE="gz"; \
451+
DUMP_DATE=$$(basename "$$SOURCE" | sed -E 's/^pgdump-geoportal_production-([0-9]{8})\.sql\.gz$$/\1/') ;; \
452+
*) \
453+
echo "ERROR: Unrecognized dump filename: $$SOURCE"; \
454+
exit 1 ;; \
455+
esac; \
443456
if ! echo "$$DUMP_DATE" | grep -Eq '^[0-9]{8}$$'; then \
444457
echo "ERROR: Could not parse dump date from $$SOURCE."; \
445458
exit 1; \
@@ -450,12 +463,13 @@ gbl-admin-db-restore: ## Restore GBL Admin dump to local ParadeDB
450463
docker compose exec -T paradedb psql -U postgres -d postgres -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '$$DB_NAME' AND pid <> pg_backend_pid();" || true; \
451464
docker compose exec -T paradedb psql -U postgres -d postgres -c "DROP DATABASE IF EXISTS \"$$DB_NAME\";"; \
452465
docker compose exec -T paradedb psql -U postgres -d postgres -c "CREATE DATABASE \"$$DB_NAME\" OWNER postgres;"; \
453-
if [ -n "$$LOCAL_SQL" ]; then \
454-
echo "Restoring from decompressed SQL: $$LOCAL_SQL"; \
455-
cat "$$LOCAL_SQL" | docker compose exec -T paradedb psql -U postgres -d "$$DB_NAME"; \
466+
echo "Selected newest local dump: $$SOURCE"; \
467+
if [ "$$RESTORE_MODE" = "sql" ]; then \
468+
echo "Restoring from decompressed SQL: $$SOURCE"; \
469+
cat "$$SOURCE" | docker compose exec -T paradedb psql -U postgres -d "$$DB_NAME"; \
456470
else \
457-
echo "Streaming from compressed dump: $$LOCAL_GZ (no extra disk used)"; \
458-
gunzip -c "$$LOCAL_GZ" | docker compose exec -T paradedb psql -U postgres -d "$$DB_NAME"; \
471+
echo "Streaming from compressed dump: $$SOURCE (no extra disk used)"; \
472+
gunzip -c "$$SOURCE" | docker compose exec -T paradedb psql -U postgres -d "$$DB_NAME"; \
459473
fi; \
460474
echo "Restore complete."; \
461475
echo "Dump used: $$SOURCE"; \
@@ -475,7 +489,22 @@ gbl-admin-db-restore: ## Restore GBL Admin dump to local ParadeDB
475489
-e DB_PORT="5432" \
476490
-e DB_USER="postgres" \
477491
-e DB_PASSWORD="$$DB_PASSWORD" \
478-
api bash -lc 'cd /app/backend && python db/migrations/bridge_old_production.py --create-view'
492+
api bash -lc 'cd /app/backend && python db/migrations/bridge_old_production.py --create-view'; \
493+
if [ "$(GBL_ADMIN_RETAIN_DBS)" -lt 1 ]; then \
494+
echo "ERROR: GBL_ADMIN_RETAIN_DBS must be at least 1."; \
495+
exit 1; \
496+
fi; \
497+
PRUNE_DBS=$$(docker compose exec -T paradedb psql -U postgres -d postgres -Atc "WITH ranked AS ( SELECT datname, ROW_NUMBER() OVER ( ORDER BY CASE WHEN datname = '$$DB_NAME' THEN 0 ELSE 1 END, datname DESC ) AS rn FROM pg_database WHERE datname LIKE 'geoportal_production_%' ) SELECT datname FROM ranked WHERE rn > $(GBL_ADMIN_RETAIN_DBS);"); \
498+
if [ -n "$$PRUNE_DBS" ]; then \
499+
echo "Pruning older restored GBL Admin databases (retaining $(GBL_ADMIN_RETAIN_DBS))..."; \
500+
for PRUNE_DB in $$PRUNE_DBS; do \
501+
echo "Dropping old restored DB: $$PRUNE_DB"; \
502+
docker compose exec -T paradedb psql -U postgres -d postgres -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '$$PRUNE_DB' AND pid <> pg_backend_pid();" || true; \
503+
docker compose exec -T paradedb psql -U postgres -d postgres -c "DROP DATABASE IF EXISTS \"$$PRUNE_DB\";"; \
504+
done; \
505+
else \
506+
echo "No older restored GBL Admin databases to prune."; \
507+
fi
479508

480509
# End-to-end: download latest dump and restore (uses the newest local .sql or .sql.gz dump)
481510
gbl-admin-db-sync: gbl-admin-db-download gbl-admin-db-restore ## Download + restore GBL Admin dump
@@ -510,6 +539,12 @@ gbl-admin-db-import-resources: ## Import resources from GBL Admin bridge
510539
echo "ERROR: Could not read POSTGRES_PASSWORD from paradedb container."; \
511540
exit 1; \
512541
fi; \
542+
IMPORT_FLAGS="--conflict $(GBL_ADMIN_IMPORT_CONFLICT) --verify"; \
543+
case "$(GBL_ADMIN_RETIRE_MISSING)" in \
544+
1|true|TRUE|yes|YES) \
545+
IMPORT_FLAGS="$$IMPORT_FLAGS --retire-missing"; \
546+
echo "Missing resources will be marked retired after import." ;; \
547+
esac; \
513548
echo "OLD_DB_NAME=$$RESOLVED_OLD_DB_NAME"; \
514549
docker compose exec -T \
515550
-e OLD_DB_NAME="$$RESOLVED_OLD_DB_NAME" \
@@ -518,7 +553,7 @@ gbl-admin-db-import-resources: ## Import resources from GBL Admin bridge
518553
-e DB_PORT="5432" \
519554
-e DB_USER="postgres" \
520555
-e DB_PASSWORD="$$DB_PASSWORD" \
521-
api bash -lc 'cd /app/backend && python db/migrations/import_from_old_production.py --conflict $(GBL_ADMIN_IMPORT_CONFLICT) --verify'
556+
api bash -lc "cd /app/backend && python db/migrations/import_from_old_production.py $$IMPORT_FLAGS"
522557

523558
# Populate resource_distributions from legacy document_distributions.
524559
# Uses the latest restored geoportal_production_* DB if OLD_DB_NAME is unset.
@@ -589,7 +624,13 @@ populate-data-dictionaries: ## Populate data dictionaries from legacy tables
589624
api bash -lc 'cd /app/backend && python db/migrations/migrate_resource_data_dictionaries.py'
590625

591626
# Full GBL Admin import pipeline after restore.
592-
gbl-admin-db-import-all: gbl-admin-db-add-latest-btaa-fields gbl-admin-db-import-resources populate-distributions populate-data-dictionaries populate-relationships reindex ## Full GBL Admin import pipeline
627+
gbl-admin-db-import-all: ## Full GBL Admin import pipeline
628+
@$(MAKE) gbl-admin-db-add-latest-btaa-fields
629+
@$(MAKE) gbl-admin-db-import-resources GBL_ADMIN_RETIRE_MISSING=true
630+
@$(MAKE) populate-distributions
631+
@$(MAKE) populate-data-dictionaries
632+
@$(MAKE) populate-relationships
633+
@$(MAKE) reindex
593634
@echo "GBL Admin full import pipeline complete!"
594635

595636
# Search indexing tasks

backend/app/services/ogm_harvest/importer.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@
1515
)
1616
from app.services.ogm_harvest.aardvark_reader import extract_record_id
1717
from app.services.ogm_harvest.repository import OGMHarvestRepository
18+
from app.services.relationship_sync import (
19+
sync_relationships_for_batch,
20+
sync_relationships_for_resource_ids,
21+
)
1822
from db.database import database
1923
from db.models import resources
2024

@@ -75,6 +79,16 @@ def _parse_iso_date(value: str) -> Optional[date]:
7579
return None
7680

7781

82+
def _normalize_scalar_string(value: Any) -> Optional[str]:
83+
if value is None:
84+
return None
85+
if isinstance(value, list):
86+
values = [str(v).strip() for v in value if str(v).strip()]
87+
return values[0] if values else None
88+
normalized = str(value).strip()
89+
return normalized or None
90+
91+
7892
class OGMResourceImporter:
7993
"""
8094
Import Aardvark records into the `resources` table (UPSERT).
@@ -180,6 +194,12 @@ def _normalize_record(self, record: Dict[str, Any], repo_name: str) -> Dict[str,
180194
workflow_list = [str(v).strip() for v in out["b1g_harvestWorkflow_s"] if str(v).strip()]
181195
out["b1g_harvestWorkflow_s"] = workflow_list[0] if workflow_list else None
182196

197+
publication_state = _normalize_scalar_string(out.get("publication_state"))
198+
b1g_publication_state = _normalize_scalar_string(out.get("b1g_publication_state_s"))
199+
effective_publication_state = publication_state or b1g_publication_state or "published"
200+
out["publication_state"] = effective_publication_state
201+
out["b1g_publication_state_s"] = effective_publication_state
202+
183203
# Tag injection
184204
tags: List[str] = []
185205
existing = out.get("b1g_adminTags_sm")
@@ -301,6 +321,15 @@ def _add_error_sample(stage: str, error: Exception, rid: Optional[str] = None) -
301321
str(dist_err),
302322
)
303323

324+
try:
325+
await sync_relationships_for_resource_ids([str(rid)])
326+
except Exception as rel_err:
327+
logger.warning(
328+
"Relationship sync failed for %s; continuing. err=%s",
329+
rid,
330+
str(rel_err),
331+
)
332+
304333
# Mark seen for missing tracking
305334
await self.repo.upsert_resource_seen(
306335
ogm_repo_name=repo_name,
@@ -433,6 +462,13 @@ async def _flush_rows(rows: List[Dict[str, Any]], seen: List[Dict[str, Any]]) ->
433462
"Distribution sync failed for batch; continuing. err=%s",
434463
str(dist_err),
435464
)
465+
try:
466+
await sync_relationships_for_batch(rows)
467+
except Exception as rel_err:
468+
logger.warning(
469+
"Relationship sync failed for batch; continuing. err=%s",
470+
str(rel_err),
471+
)
436472
await self.repo.upsert_resources_seen_batch(repo_name, seen)
437473
return len(rows)
438474
except Exception as e:
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
from __future__ import annotations
2+
3+
import logging
4+
from typing import Any, Dict, Iterable, List, Sequence, Set, Tuple
5+
6+
from sqlalchemy import delete, or_, select
7+
8+
from db.database import database
9+
from db.models import resource_relationships, resources
10+
11+
logger = logging.getLogger(__name__)
12+
13+
14+
# Each family is (multivalued field on `resources`, forward predicate,
# inverse predicate) mirrored into `resource_relationships`.
RELATIONSHIP_FAMILIES: Tuple[Tuple[str, str, str], ...] = (
    ("dct_relation_sm", "dct:relation", "dct:relation"),
    ("dct_isPartOf_sm", "dct:isPartOf", "dct:hasPart"),
    ("pcdm_memberOf_sm", "pcdm:memberOf", "pcdm:hasMember"),
    ("dct_source_sm", "dct:source", "dct:sourceOf"),
    ("dct_isVersionOf_sm", "dct:isVersionOf", "dct:hasVersion"),
    ("dct_replaces_sm", "dct:replaces", "dct:isReplacedBy"),
    ("dct_isReplacedBy_sm", "dct:isReplacedBy", "dct:replaces"),
)

# Sorted union of every forward and inverse predicate declared above.
ALL_RELATIONSHIP_PREDICATES: Tuple[str, ...] = tuple(
    sorted({p for _, fwd, inv in RELATIONSHIP_FAMILIES for p in (fwd, inv)})
)
33+
34+
35+
def _normalize_resource_ids(resource_ids: Sequence[Any]) -> List[str]:
36+
normalized: List[str] = []
37+
seen: Set[str] = set()
38+
for resource_id in resource_ids:
39+
value = str(resource_id or "").strip()
40+
if not value or value in seen:
41+
continue
42+
seen.add(value)
43+
normalized.append(value)
44+
return normalized
45+
46+
47+
def _normalize_related_ids(value: Any) -> List[str]:
48+
if value is None:
49+
return []
50+
if isinstance(value, (list, tuple, set)):
51+
raw_values = value
52+
else:
53+
raw_values = [value]
54+
55+
related_ids: List[str] = []
56+
seen: Set[str] = set()
57+
for item in raw_values:
58+
candidate = str(item or "").strip()
59+
if not candidate or candidate in seen:
60+
continue
61+
seen.add(candidate)
62+
related_ids.append(candidate)
63+
return related_ids
64+
65+
66+
def _build_relationship_rows(
    rows_by_family: Dict[Tuple[str, str, str], Iterable[Dict[str, Any]]],
    tracked_ids: Sequence[Any],
) -> List[Dict[str, str]]:
    """Expand per-family resource rows into deduplicated forward + inverse triples.

    Only pairs touching at least one tracked id are kept; self-references are
    dropped. Output is sorted so inserts are deterministic.
    """
    tracked = set(_normalize_resource_ids(tracked_ids))
    triples: Set[Tuple[str, str, str]] = set()

    for (field_name, forward, inverse), family_rows in rows_by_family.items():
        for record in family_rows:
            subject = str(record.get("id") or "").strip()
            if not subject:
                continue

            for target in _normalize_related_ids(record.get(field_name)):
                if target == subject:
                    continue
                # Skip pairs where neither endpoint is among the tracked ids.
                if subject not in tracked and target not in tracked:
                    continue
                triples.add((subject, forward, target))
                triples.add((target, inverse, subject))

    return [
        {"subject_id": s, "predicate": p, "object_id": o}
        for s, p, o in sorted(triples)
    ]
92+
93+
94+
async def sync_relationships_for_resource_ids(resource_ids: Sequence[Any]) -> int:
    """Rebuild relationship rows touching the given resources.

    Deletes existing rows for the managed predicates that reference any of the
    ids, re-derives forward/inverse pairs from the `resources` table, then
    inserts the fresh set. Returns the number of rows inserted.
    """
    ids = _normalize_resource_ids(resource_ids)
    if not ids:
        return 0

    if not database.is_connected:
        await database.connect()

    # Clear managed relationship rows that touch any tracked resource, on
    # either side of the triple.
    await database.execute(
        delete(resource_relationships).where(
            resource_relationships.c.predicate.in_(ALL_RELATIONSHIP_PREDICATES),
            or_(
                resource_relationships.c.subject_id.in_(ids),
                resource_relationships.c.object_id.in_(ids),
            ),
        )
    )

    # For each family, fetch rows that either are tracked or point at a
    # tracked id (`&&` is the Postgres array-overlap operator).
    rows_by_family: Dict[Tuple[str, str, str], Iterable[Dict[str, Any]]] = {}
    for family in RELATIONSHIP_FAMILIES:
        column = resources.c[family[0]]
        stmt = select(resources.c.id, column).where(
            or_(resources.c.id.in_(ids), column.op("&&")(ids))
        )
        rows_by_family[family] = [dict(row) for row in await database.fetch_all(stmt)]

    pairs = _build_relationship_rows(rows_by_family, ids)
    if not pairs:
        logger.info(
            "Relationship sync complete for %d resources: no relationship rows to insert.",
            len(ids),
        )
        return 0

    await database.execute_many(query=resource_relationships.insert(), values=pairs)
    logger.info(
        "Relationship sync complete for %d resources: inserted %d rows.",
        len(ids),
        len(pairs),
    )
    return len(pairs)
137+
138+
139+
async def sync_relationships_for_batch(resource_rows: Sequence[Dict[str, Any]]) -> int:
    """Convenience wrapper: sync relationships for every row carrying an id."""
    candidate_ids = [row.get("id") for row in resource_rows if row.get("id")]
    return await sync_relationships_for_resource_ids(candidate_ids)

0 commit comments

Comments
 (0)