Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 131 additions & 27 deletions scripts/migrate_CERNBull_journals.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,25 @@
"""
Migrates CERN Bulletin journals form legacy (stored as DB rows) to RDM.
Then links the related periodical articles to the newly created periodical issue.

To generate the output CSV file, run the following:
migrate_bull_issues()

The output CSV (migrated issues) file will be created in the current directory.

Then, run the following to link the related articles:
link_related_articles()
"""

import csv
from copy import deepcopy
import os

import arrow
from invenio_access.permissions import system_identity
from invenio_communities.proxies import current_communities
from invenio_db import db
from invenio_rdm_records.proxies import current_rdm_records_service
from invenio_search.engine import dsl
from sqlalchemy.orm.exc import StaleDataError

filepath = "CERNBulletinJournals.csv"
Expand All @@ -14,13 +28,40 @@


def migrate_bull_issues():
migrated = load_migrated_issues()
with open(filepath, mode="r", newline="", encoding="utf-8") as infile, open(
outfilepath, mode="w", newline="", encoding="utf-8"
outfilepath, mode="a", newline="", encoding="utf-8"
) as outfile:
reader = csv.reader(infile)
writer = csv.writer(outfile)
for row in reader:
id_journal, issue_number, issue_display, date_released, date_announced = row
if not date_released:
print(f"Skipping {issue_number} because date released is missing")
continue

if issue_number in migrated:
print(
f"Skipping {issue_number}, already migrated as {migrated[issue_number]}"
)
continue

existing_pid = find_existing_issue_record(issue_number)
if existing_pid:
print(f"Skipping {issue_number}, found existing record {existing_pid}")
writer.writerow(
[
id_journal,
issue_number,
issue_display,
date_released,
date_announced,
existing_pid,
]
)
migrated[issue_number] = existing_pid
continue

print(f"Processing {issue_number}...")
data = {
"metadata": {
Expand Down Expand Up @@ -91,10 +132,42 @@ def migrate_bull_issues():
record.id,
]
)
migrated[issue_number] = record.id
except Exception as e:
raise e


def load_migrated_issues():
"""Load already migrated issues from the output CSV."""
migrated = {}
if not os.path.exists(outfilepath):
return migrated
with open(outfilepath, mode="r", newline="", encoding="utf-8") as file:
reader = csv.reader(file)
for row in reader:
if len(row) < 6:
continue
issue_number, record_pid = row[1], row[5]
migrated[issue_number] = record_pid
return migrated


def find_existing_issue_record(issue_number):
"""Find an already created bulletin issue record by issue number."""
issue_number_query = issue_number.replace("/", "\\/")
results = current_rdm_records_service.scan(
system_identity,
q=(
f'metadata.additional_descriptions.description:"{issue_number_query}" '
"AND metadata.resource_type.id:publication-periodicalissue"
),
)
hits = list(results)
if not hits:
return None
return hits[0]["id"]


def link_related_articles():
with open(outfilepath, mode="r", newline="", encoding="utf-8") as file:
reader = csv.reader(file)
Expand All @@ -107,37 +180,68 @@ def link_related_articles():
date_announced,
record_pid,
) = row
if not record_pid:
continue

issue_number = issue_number.replace("/", "\\/")
# The extra filter is used to find the related articles for the given issue number.
# For example, it matches the March 2026 issue with the following issue numbers in the custom field "journal:journal.issue":
# 2/2026-3/2026-4/2026-5/2026
# 02/2026-03/2026-04/2026-05/2026
# 2/2026-3/2026
# 03/2026-04/2026
# 03/2026
# 3/2026
extra_filter = dsl.Q(
"regexp",
**{
"custom_fields.journal:journal.issue.keyword": {
"value": f"(.*-)?0?{issue_number}(-.*)?",
"flags": "NONE",
}
},
)
results = current_rdm_records_service.scan(
system_identity,
q=f"custom_fields.journal\:journal.issue:*{issue_number}*",
q="metadata.resource_type.id:publication-periodicalarticle",
extra_filter=extra_filter,
)
list_res = list(results)
print(
f"Found {len(list_res)} related articles for issue {issue_number} {record_pid}"
)
for hit in list_res:
data = {
"identifier": record_pid,
"relation_type": {"id": "ispublishedin"},
"scheme": "cds",
"resource_type": {"id": "publication-periodicalissue"},
}
record = current_rdm_records_service.read(system_identity, hit["id"])
_draft = current_rdm_records_service.edit(system_identity, record["id"])

update_data = deepcopy(_draft.data)
if "related_identifiers" not in update_data["metadata"]:
update_data["metadata"]["related_identifiers"] = []
if data not in record.data["metadata"].get("related_identifiers", []):
update_data["metadata"]["related_identifiers"].append(data)
draft = current_rdm_records_service.update_draft(
system_identity, _draft["id"], update_data
)
record = current_rdm_records_service.publish(
system_identity, draft["id"]
)
print(f"Linked {hit['id']} to {record_pid}")
else:
print(f"Skipped {hit['id']} to {record_pid}")
link_article_to_issue(hit["id"], record_pid)


def link_article_to_issue(hit_id, record_pid):
"""Append an ispublishedin link to a bulletin issue if not already present."""
record = current_rdm_records_service.read(system_identity, hit_id)
related_identifiers = record.data["metadata"].get("related_identifiers", [])

existing_rel_ids = {
rel_id["identifier"]
for rel_id in related_identifiers
if (
rel_id.get("relation_type", {}).get("id") == "ispublishedin"
and rel_id.get("scheme") == "cds"
and rel_id.get("resource_type", {}).get("id") == "publication-periodicalissue"
)
}
if record_pid in existing_rel_ids:
print(f"Skipped {hit_id}, already linked to {record_pid}")
return

draft = current_rdm_records_service.edit(system_identity, record["id"])
draft.data["metadata"].setdefault("related_identifiers", []).append(
{
"identifier": record_pid,
"relation_type": {"id": "ispublishedin"},
"scheme": "cds",
"resource_type": {"id": "publication-periodicalissue"},
}
)
draft = current_rdm_records_service.update_draft(
system_identity, draft.id, draft.data
)
current_rdm_records_service.publish(system_identity, draft.id)
print(f"Linked {hit_id} to {record_pid}")
Loading