From 5b86ec9291dca81888b3afcd0769e426bd6a11d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Z=C3=BCbeyde=20Civelek?= Date: Wed, 17 Jun 2026 19:08:13 +0200 Subject: [PATCH] fix: preserve file version snapshots during version creation --- .../rdm/records/transform/transform.py | 21 ++- tests/cds-rdm/test_transform_versions.py | 165 ++++++++++++++++++ 2 files changed, 178 insertions(+), 8 deletions(-) create mode 100644 tests/cds-rdm/test_transform_versions.py diff --git a/cds_migrator_kit/rdm/records/transform/transform.py b/cds_migrator_kit/rdm/records/transform/transform.py index c2764f92..0afa321d 100644 --- a/cds_migrator_kit/rdm/records/transform/transform.py +++ b/cds_migrator_kit/rdm/records/transform/transform.py @@ -907,11 +907,7 @@ def compute_access(file, record_access): "meta": file["status"], } - def compute_files(file_dump, versions_dict): - legacy_path_root = Path("/opt/cdsweb/var/data/files/") - tmp_eos_root = Path(self.files_dump_dir) - full_path = Path(file_dump["full_path"]) - + def should_skip_file(file_dump): if file_dump["subformat"] in FILE_SUBFORMATS_TO_DROP: self.migration_logger.add_information( str(file_dump["recid"]), @@ -920,7 +916,7 @@ def compute_files(file_dump, versions_dict): "value": file_dump["full_name"], }, ) - return + return True if not self.plots and file_dump["type"] == "Plot": # skip figures if configuration says so @@ -931,7 +927,7 @@ def compute_files(file_dump, versions_dict): "value": file_dump["full_name"], }, ) - return + return True if file_dump["hidden"]: # skip hidden files self.migration_logger.add_information( @@ -941,6 +937,13 @@ def compute_files(file_dump, versions_dict): "value": file_dump["full_name"], }, ) + return True + return False + + def compute_files(file_dump, versions_dict): + legacy_path_root = Path("/opt/cdsweb/var/data/files/") + tmp_eos_root = Path(self.files_dump_dir) + full_path = Path(file_dump["full_path"]) versions_dict[file_dump["version"]]["files"].update( { @@ -978,6 +981,8 @@ def compute_files(file_dump, versions_dict): _files = entry["files"] record_access = record["access"] for file in _files: + if should_skip_file(file): + continue if file["version"] not in versions: versions[file["version"]] = { "files": {}, @@ -996,7 +1001,7 @@ def compute_files(file_dump, versions_dict): # we need to preserve the file B for version 2 of the record for version in versions.keys(): versioned_files |= versions.get(version, {}).get("files") - versions[version]["files"] = versioned_files + versions[version]["files"] = deepcopy(versioned_files) publication_date = record["json"]["metadata"]["publication_date"] if not versioned_files: diff --git a/tests/cds-rdm/test_transform_versions.py b/tests/cds-rdm/test_transform_versions.py new file mode 100644 index 00000000..e172b5a0 --- /dev/null +++ b/tests/cds-rdm/test_transform_versions.py @@ -0,0 +1,165 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2026 CERN. +# +# CDS-RDM is free software; you can redistribute it and/or modify it under +# the terms of the MIT License; see LICENSE file for more details. + +"""Tests for record version file snapshot logic in transform._versions().""" + +from unittest.mock import MagicMock + +import pytest + +from cds_migrator_kit.rdm.records.transform.transform import CDSToRDMRecordTransform + + +def _file_dump( + *, + full_name="draft.pdf", + file_version=1, + file_type="Main", + creation_date="2020-01-01T00:00:00+00:00", + checksum=None, + hidden=False, + subformat="", + recid=123, + bibdocid=1, +): + """Build a minimal legacy file dump entry.""" + checksum = checksum or f"checksum-v{file_version}" + return { + "comment": None, + "status": "", + "version": file_version, + "encoding": None, + "creation_date": creation_date, + "bibdocid": bibdocid, + "mime": "application/pdf", + "full_name": full_name, + "superformat": ".pdf", + "recids_doctype": [[recid, file_type, full_name]], + "path": ( + f"/opt/cdsweb/var/data/files/g{bibdocid}/{bibdocid}/" + f"content.pdf;{file_version}" + ), + "size": 1000, + "license": {}, + "modification_date": creation_date, + "copyright": {}, + "url": f"http://cds.cern.ch/record/{recid}/files/{full_name}", + "checksum": checksum, + "description": None, + "format": ".pdf", + "name": full_name.rsplit(".", 1)[0], + "subformat": subformat, + "etag": f'"{bibdocid}.pdf{file_version}"', + "recid": recid, + "flags": [], + "hidden": hidden, + "type": file_type, + "full_path": ( + f"/opt/cdsweb/var/data/files/g{bibdocid}/{bibdocid}/" + f"content.pdf;{file_version}" + ), + } + + +def _record(): + """Build a minimal record.""" + return { + "access": "public", + "json": {"metadata": {"publication_date": "2020-01-01"}}, + } + + +@pytest.fixture +def transform(tmp_path): + """Transform instance.""" + return CDSToRDMRecordTransform( + files_dump_dir=tmp_path, + missing_users=tmp_path, + migration_logger=MagicMock(), + ) + + +def test_versions_preserve_file_revision_per_record_version(transform): + """Each record version keeps the file revision.""" + entry = { + "recid": 123, + "files": [ + _file_dump(file_version=1, checksum="checksum-v1"), + _file_dump(file_version=2, checksum="checksum-v2"), + _file_dump(full_name="test.pdf", file_version=1, checksum="checksum-v3"), + ], + } + + versions = transform._versions(entry, _record()) + + assert list(versions.keys()) == [1, 2] + assert versions[1]["files"]["draft.pdf"]["version"] == 1 + assert versions[1]["files"]["draft.pdf"]["checksum"] == "checksum-v1" + assert versions[1]["files"]["test.pdf"]["version"] == 1 + assert versions[2]["files"]["draft.pdf"]["version"] == 2 + assert versions[2]["files"]["test.pdf"]["version"] == 1 + assert versions[2]["files"]["draft.pdf"]["checksum"] == "checksum-v2" + assert versions[1]["files"] is not versions[2]["files"] + + +def test_versions_with_skipped_files(transform): + """Versions with skipped files should not create extra record versions.""" + entry = { + "recid": 123, + "files": [ + _file_dump( + full_name="main.pdf", + file_version=1, + bibdocid=10, + creation_date="2020-01-01T00:00:00+00:00", + ), + _file_dump( + full_name="main.pdf", + file_version=2, + bibdocid=10, + creation_date="2020-01-02T00:00:00+00:00", + ), + _file_dump( + full_name="plot.png", + file_version=1, + file_type="Plot", + bibdocid=20, + creation_date="2020-01-03T00:00:00+00:00", + ), + _file_dump( + full_name="plot.png", + file_version=2, + file_type="Plot", + bibdocid=20, + creation_date="2020-01-04T00:00:00+00:00", + ), + _file_dump( + full_name="plot.png", + file_version=3, + file_type="Plot", + bibdocid=20, + creation_date="2020-01-05T00:00:00+00:00", + ), + _file_dump( + full_name="plot.png", + file_version=4, + file_type="Plot", + bibdocid=20, + creation_date="2020-01-06T00:00:00+00:00", + ), + ], + } + + versions = transform._versions(entry, _record()) + + assert list(versions.keys()) == [1, 2] + assert set(versions[1]["files"]) == {"main.pdf"} + assert set(versions[2]["files"]) == {"main.pdf"} + assert versions[1]["files"]["main.pdf"]["version"] == 1 + assert versions[2]["files"]["main.pdf"]["version"] == 2 + assert "plot.png" not in versions[1]["files"] + assert "plot.png" not in versions[2]["files"]