From 293beceeefad1b47973b8b4f050b9390a0c83d46 Mon Sep 17 00:00:00 2001 From: Fatimah Zulfiqar Date: Mon, 22 Jun 2026 11:17:31 +0200 Subject: [PATCH 1/2] config: add flag to update publication date for new version --- cds_migrator_kit/errors.py | 1 + cds_migrator_kit/rdm/records/load/load.py | 9 ++++++++- cds_migrator_kit/rdm/records/transform/transform.py | 5 +++-- .../rdm/records/transform/xml_processing/rules/base.py | 4 ++-- cds_migrator_kit/runner/runner.py | 4 ++++ 5 files changed, 18 insertions(+), 5 deletions(-) diff --git a/cds_migrator_kit/errors.py b/cds_migrator_kit/errors.py index 00453339..5cbe912d 100644 --- a/cds_migrator_kit/errors.py +++ b/cds_migrator_kit/errors.py @@ -50,6 +50,7 @@ class MultipleModelsMatched(CDSMigrationException): description = "[Record matched multiple models]" + class UnexpectedValue(CDSMigrationException): """The corresponding value is unexpected.""" diff --git a/cds_migrator_kit/rdm/records/load/load.py b/cds_migrator_kit/rdm/records/load/load.py index 120dc168..ed5b4e62 100644 --- a/cds_migrator_kit/rdm/records/load/load.py +++ b/cds_migrator_kit/rdm/records/load/load.py @@ -61,6 +61,7 @@ def __init__( dry_run=False, legacy_pids_to_redirect=None, collection=None, + update_publication_date=True, migration_logger=None, record_state_logger=None, ): @@ -69,6 +70,7 @@ def __init__( self.legacy_pids_to_redirect = {} self.clc_sync = False self.collection = collection + self.update_publication_date = update_publication_date self.migration_logger = migration_logger self.record_state_logger = record_state_logger if legacy_pids_to_redirect is not None: @@ -455,7 +457,6 @@ def _pre_publish(self, identity, entry, version, draft, uow): """Create and process draft before publish.""" versions = entry["versions"] files = versions[version]["files"] - publication_date = versions[version]["publication_date"] access = versions[version]["access"] if version == 1 or (version > 1 and draft is None): @@ -487,6 +488,12 @@ def _pre_publish(self, 
identity, entry, version, draft, uow): identity, draft["id"], uow=uow ) draft_dict = draft.to_dict() + if not self.update_publication_date: + publication_date = arrow.get( + entry["record"]["json"]["metadata"]["publication_date"] + ) + else: + publication_date = versions[version]["publication_date"] missing_data = { **draft_dict, "metadata": { diff --git a/cds_migrator_kit/rdm/records/transform/transform.py b/cds_migrator_kit/rdm/records/transform/transform.py index 0afa321d..e5439114 100644 --- a/cds_migrator_kit/rdm/records/transform/transform.py +++ b/cds_migrator_kit/rdm/records/transform/transform.py @@ -35,7 +35,8 @@ MissingRequiredField, RecordFlaggedCuration, RestrictedFileDetected, - UnexpectedValue, MultipleModelsMatched, + UnexpectedValue, + MultipleModelsMatched, ) from cds_migrator_kit.rdm.migration_config import ( RDM_RECORDS_IDENTIFIERS_SCHEMES, @@ -847,7 +848,7 @@ def _transform(self, entry): UnexpectedValue, ManualImportRequired, MissingRequiredField, - MultipleModelsMatched + MultipleModelsMatched, ) as e: migration_logger.add_log(e, record=entry) diff --git a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py index 188d612c..826555ff 100644 --- a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py +++ b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py @@ -816,10 +816,10 @@ def related_identifiers_787(self, key, value): "relation_type": {"id": "references"}, "resource_type": {"id": "publication-conferencepaper"}, }, - "article":{ + "article": { "relation_type": {"id": "references"}, "resource_type": {"id": "publication-article"}, - } + }, } if recid: diff --git a/cds_migrator_kit/runner/runner.py b/cds_migrator_kit/runner/runner.py index efe4bba9..05512c93 100644 --- a/cds_migrator_kit/runner/runner.py +++ b/cds_migrator_kit/runner/runner.py @@ -58,6 +58,9 @@ def __init__( stream_config = config.get(definition.name) or {} 
self.data_dir = Path(stream_config[collection].get("data_dir")) self.restricted = stream_config[collection].get("restricted", False) + self.update_publication_date = stream_config[collection].get( + "update_publication_date", True + ) self.access_grants_view = stream_config[collection].get( "access_grants_view", False ) @@ -112,6 +115,7 @@ def __init__( tmp_dir=tmp_dir, dry_run=dry_run, collection=collection, + update_publication_date=self.update_publication_date, migration_logger=self.migration_logger, record_state_logger=self.record_state_logger, **stream_config[collection].get("load", {}), From ed0564a2b6c76721364ecb77582f55e544e771f9 Mon Sep 17 00:00:00 2001 From: Fatimah Zulfiqar Date: Mon, 22 Jun 2026 15:21:57 +0200 Subject: [PATCH 2/2] tests: added tests for the publication date config --- cds_migrator_kit/rdm/records/load/load.py | 8 +- cds_migrator_kit/runner/runner.py | 8 +- tests/cds-rdm/helpers.py | 15 ++++ tests/cds-rdm/test_full_migration.py | 4 +- .../test_new_version_publication_date.py | 74 +++++++++++++++++++ 5 files changed, 100 insertions(+), 9 deletions(-) create mode 100644 tests/cds-rdm/test_new_version_publication_date.py diff --git a/cds_migrator_kit/rdm/records/load/load.py b/cds_migrator_kit/rdm/records/load/load.py index ed5b4e62..a1a1bed9 100644 --- a/cds_migrator_kit/rdm/records/load/load.py +++ b/cds_migrator_kit/rdm/records/load/load.py @@ -61,7 +61,7 @@ def __init__( dry_run=False, legacy_pids_to_redirect=None, collection=None, - update_publication_date=True, + update_new_version_publication_date=True, migration_logger=None, record_state_logger=None, ): @@ -70,7 +70,7 @@ def __init__( self.legacy_pids_to_redirect = {} self.clc_sync = False self.collection = collection - self.update_publication_date = update_publication_date + self.update_new_version_publication_date = update_new_version_publication_date self.migration_logger = migration_logger self.record_state_logger = record_state_logger if legacy_pids_to_redirect is not None: @@ 
-488,7 +488,7 @@ def _pre_publish(self, identity, entry, version, draft, uow): identity, draft["id"], uow=uow ) draft_dict = draft.to_dict() - if not self.update_publication_date: + if not self.update_new_version_publication_date: publication_date = arrow.get( entry["record"]["json"]["metadata"]["publication_date"] ) @@ -534,7 +534,7 @@ def _load_versions(self, entry, uow): ) # Run after publish fixes self._after_publish(identity, published_record, entry, version, uow) - records.append(published_record._record) + records.append(published_record._record) if records: record_state_context = self._load_record_state(legacy_recid, records) diff --git a/cds_migrator_kit/runner/runner.py b/cds_migrator_kit/runner/runner.py index 05512c93..54528805 100644 --- a/cds_migrator_kit/runner/runner.py +++ b/cds_migrator_kit/runner/runner.py @@ -58,9 +58,9 @@ def __init__( stream_config = config.get(definition.name) or {} self.data_dir = Path(stream_config[collection].get("data_dir")) self.restricted = stream_config[collection].get("restricted", False) - self.update_publication_date = stream_config[collection].get( - "update_publication_date", True - ) + self.update_new_version_publication_date = stream_config[ + collection + ].get("update_new_version_publication_date", True) self.access_grants_view = stream_config[collection].get( "access_grants_view", False ) @@ -115,7 +115,7 @@ def __init__( tmp_dir=tmp_dir, dry_run=dry_run, collection=collection, - update_publication_date=self.update_publication_date, + update_new_version_publication_date=self.update_new_version_publication_date, migration_logger=self.migration_logger, record_state_logger=self.record_state_logger, **stream_config[collection].get("load", {}), diff --git a/tests/cds-rdm/helpers.py b/tests/cds-rdm/helpers.py index d134f58f..bbec7aee 100644 --- a/tests/cds-rdm/helpers.py +++ b/tests/cds-rdm/helpers.py @@ -116,6 +116,21 @@ def config(mocker, community, orcid_name_data): "legacy_pids_to_redirect": 
"cds_migrator_kit/rdm/data/summer_student_reports/duplicated_pids.json" }, }, + "sspn_publication_date_consistency": { + "data_dir": "tests/cds-rdm/data/sspn", + "tmp_dir": "tests/cds-rdm/data/sspn", + "log_dir": "tests/cds-rdm/data/log/sspn", + "extract": {"dirpath": "tests/cds-rdm/data/sspn/dumps/"}, + "update_new_version_publication_date": False, + "transform": { + "files_dump_dir": "tests/cds-rdm/data/sspn/files/", + "missing_users": "tests/cds-rdm/data/users", + "communities_ids": [f"{str(community.id)}"], + }, + "load": { + "legacy_pids_to_redirect": "cds_migrator_kit/rdm/data/summer_student_reports/duplicated_pids.json" + }, + }, "bulletin_issue": { "data_dir": "tests/cds-rdm/data/bulletin_issue", "tmp_dir": "tests/cds-rdm/data/bulletin_issue", diff --git a/tests/cds-rdm/test_full_migration.py b/tests/cds-rdm/test_full_migration.py index 2971e5f1..a444fcc9 100644 --- a/tests/cds-rdm/test_full_migration.py +++ b/tests/cds-rdm/test_full_migration.py @@ -241,11 +241,13 @@ def multiple_versions(record, record_state): ) dict_first_version = first_version.to_dict() # It matches record created date instead of the file creation date - assert dict_first_version["created"] == "2024-02-19T12:42:58+00:00" + assert dict_first_version["created"] == "2024-02-19T13:51:23+00:00" + assert dict_first_version["metadata"]["publication_date"] == "2022-08-31" assert dict_rec["versions"]["index"] == 2 # Check that the record creation date matches the files creation date assert dict_rec["created"] == "2024-02-19T12:47:01+00:00" + assert dict_rec["metadata"]["publication_date"] == "2024-02-19" def multiple_versions_with_cs(record): diff --git a/tests/cds-rdm/test_new_version_publication_date.py b/tests/cds-rdm/test_new_version_publication_date.py new file mode 100644 index 00000000..fa8dcb33 --- /dev/null +++ b/tests/cds-rdm/test_new_version_publication_date.py @@ -0,0 +1,74 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2026 CERN. 
+#
+# CDS-RDM is free software; you can redistribute it and/or modify it under
+# the terms of the MIT License; see LICENSE file for more details.
+
+"""Tests for the update_new_version_publication_date stream option."""
+
+import json
+from pathlib import Path
+
+from cds_rdm.legacy.models import CDSMigrationLegacyRecord
+from helpers import config
+from invenio_access.permissions import system_identity
+from invenio_rdm_records.proxies import current_rdm_records_service
+
+from cds_migrator_kit.rdm.records.streams import RecordStreamDefinition
+from cds_migrator_kit.runner.runner import Runner
+
+
+def publication_date_consistency_across_versions(record, record_state):
+    """Check legacy record 2889522: all versions share one publication date."""
+    dict_rec = record.to_dict()
+
+    all_dates = []
+
+    for record_version in record_state["versions"]:
+        rec = current_rdm_records_service.read(
+            system_identity, record_version["new_recid"]
+        )
+        dict_version = rec.to_dict()
+
+        all_dates.append(dict_version["metadata"]["publication_date"])
+    assert len(all_dates) > 0
+
+    # Check all versions have the same publication date
+    assert len(set(all_dates)) == 1
+    assert dict_rec["metadata"]["publication_date"] == all_dates[0]
+
+
+def test_new_version_publication_date(
+    test_app,
+    orcid_name_data,
+    community,
+    mocker,
+    groups,
+):
+    stream_config = config(mocker, community, orcid_name_data)
+
+    runner = Runner(
+        stream_definitions=[RecordStreamDefinition],
+        config_filepath=Path(stream_config).absolute(),
+        dry_run=False,
+        collection="sspn_publication_date_consistency",
+        keep_logs=False,
+    )
+    runner.run()
+
+    with open(
+        "tests/cds-rdm/tmp/logs/sspn_publication_date_consistency/rdm_records_state.json",
+        "r",
+    ) as state_logs:
+        records = json.load(state_logs)
+
+    checked = False  # flipped once record 2889522 is verified; asserted below
+    for record in records:
+        loaded_rec = current_rdm_records_service.read(
+            system_identity, record["latest_version"]
+        )
+        if record["legacy_recid"] == "2889522":
+            publication_date_consistency_across_versions(loaded_rec, record)
+            checked = True
+    assert checked, "legacy record 2889522 not found in the state log"