Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 13 additions & 8 deletions cds_migrator_kit/rdm/records/transform/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -907,11 +907,7 @@ def compute_access(file, record_access):
"meta": file["status"],
}

def compute_files(file_dump, versions_dict):
legacy_path_root = Path("/opt/cdsweb/var/data/files/")
tmp_eos_root = Path(self.files_dump_dir)
full_path = Path(file_dump["full_path"])

def should_skip_file(file_dump):
if file_dump["subformat"] in FILE_SUBFORMATS_TO_DROP:
self.migration_logger.add_information(
str(file_dump["recid"]),
Expand All @@ -920,7 +916,7 @@ def compute_files(file_dump, versions_dict):
"value": file_dump["full_name"],
},
)
return
return True

if not self.plots and file_dump["type"] == "Plot":
# skip figures if configuration says so
Expand All @@ -931,7 +927,7 @@ def compute_files(file_dump, versions_dict):
"value": file_dump["full_name"],
},
)
return
return True
if file_dump["hidden"]:
# skip hidden files
self.migration_logger.add_information(
Expand All @@ -941,6 +937,13 @@ def compute_files(file_dump, versions_dict):
"value": file_dump["full_name"],
},
)
return True
return False

def compute_files(file_dump, versions_dict):
legacy_path_root = Path("/opt/cdsweb/var/data/files/")
tmp_eos_root = Path(self.files_dump_dir)
full_path = Path(file_dump["full_path"])

versions_dict[file_dump["version"]]["files"].update(
{
Expand Down Expand Up @@ -978,6 +981,8 @@ def compute_files(file_dump, versions_dict):
_files = entry["files"]
record_access = record["access"]
for file in _files:
if should_skip_file(file):
continue
if file["version"] not in versions:
versions[file["version"]] = {
"files": {},
Expand All @@ -996,7 +1001,7 @@ def compute_files(file_dump, versions_dict):
# we need to preserve the file B for version 2 of the record
for version in versions.keys():
versioned_files |= versions.get(version, {}).get("files")
versions[version]["files"] = versioned_files
versions[version]["files"] = deepcopy(versioned_files)
publication_date = record["json"]["metadata"]["publication_date"]

if not versioned_files:
Expand Down
165 changes: 165 additions & 0 deletions tests/cds-rdm/test_transform_versions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2026 CERN.
#
# CDS-RDM is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.

"""Tests for record version file snapshot logic in transform._versions()."""

from unittest.mock import MagicMock

import pytest

from cds_migrator_kit.rdm.records.transform.transform import CDSToRDMRecordTransform


def _file_dump(
*,
full_name="draft.pdf",
file_version=1,
file_type="Main",
creation_date="2020-01-01T00:00:00+00:00",
checksum=None,
hidden=False,
subformat="",
recid=123,
bibdocid=1,
):
"""Build a minimal legacy file dump entry."""
checksum = checksum or f"checksum-v{file_version}"
return {
"comment": None,
"status": "",
"version": file_version,
"encoding": None,
"creation_date": creation_date,
"bibdocid": bibdocid,
"mime": "application/pdf",
"full_name": full_name,
"superformat": ".pdf",
"recids_doctype": [[recid, file_type, full_name]],
"path": (
f"/opt/cdsweb/var/data/files/g{bibdocid}/{bibdocid}/"
f"content.pdf;{file_version}"
),
"size": 1000,
"license": {},
"modification_date": creation_date,
"copyright": {},
"url": f"http://cds.cern.ch/record/{recid}/files/{full_name}",
"checksum": checksum,
"description": None,
"format": ".pdf",
"name": full_name.rsplit(".", 1)[0],
"subformat": subformat,
"etag": f'"{bibdocid}.pdf{file_version}"',
"recid": recid,
"flags": [],
"hidden": hidden,
"type": file_type,
"full_path": (
f"/opt/cdsweb/var/data/files/g{bibdocid}/{bibdocid}/"
f"content.pdf;{file_version}"
),
}


def _record():
"""Build a minimal record."""
return {
"access": "public",
"json": {"metadata": {"publication_date": "2020-01-01"}},
}


@pytest.fixture
def transform(tmp_path):
    """Provide a ``CDSToRDMRecordTransform`` wired for isolated tests.

    The pytest ``tmp_path`` directory is passed as both ``files_dump_dir``
    and ``missing_users``; the migration logger is a ``MagicMock``.
    """
    mocked_logger = MagicMock()
    instance = CDSToRDMRecordTransform(
        files_dump_dir=tmp_path,
        missing_users=tmp_path,
        migration_logger=mocked_logger,
    )
    return instance


def test_versions_preserve_file_revision_per_record_version(transform):
    """Check that every record version exposes its own file revisions.

    ``draft.pdf`` has revisions 1 and 2, ``test.pdf`` only revision 1.
    Record version 2 must carry ``draft.pdf`` at revision 2 while still
    including ``test.pdf`` at revision 1, and the file mappings of the two
    record versions must be distinct objects.
    """
    draft_rev1 = _file_dump(file_version=1, checksum="checksum-v1")
    draft_rev2 = _file_dump(file_version=2, checksum="checksum-v2")
    other_rev1 = _file_dump(
        full_name="test.pdf", file_version=1, checksum="checksum-v3"
    )
    entry = {"recid": 123, "files": [draft_rev1, draft_rev2, other_rev1]}

    versions = transform._versions(entry, _record())

    assert list(versions.keys()) == [1, 2]

    files_v1 = versions[1]["files"]
    files_v2 = versions[2]["files"]

    # Record version 1: both files at their first revision.
    assert files_v1["draft.pdf"]["version"] == 1
    assert files_v1["draft.pdf"]["checksum"] == "checksum-v1"
    assert files_v1["test.pdf"]["version"] == 1

    # Record version 2: draft.pdf moves on, test.pdf is carried over.
    assert files_v2["draft.pdf"]["version"] == 2
    assert files_v2["test.pdf"]["version"] == 1
    assert files_v2["draft.pdf"]["checksum"] == "checksum-v2"

    # The two versions must not share one file mapping.
    assert files_v1 is not files_v2


def test_versions_with_skipped_files(transform):
    """Check that skipped files do not add record versions of their own.

    ``main.pdf`` has revisions 1-2 and the ``Plot``-typed ``plot.png`` has
    revisions 1-4. The test expects the plot to be left out, so only record
    versions 1 and 2 exist and neither of them lists ``plot.png``.
    """
    main_dumps = [
        _file_dump(
            full_name="main.pdf",
            file_version=revision,
            bibdocid=10,
            creation_date=f"2020-01-{day:02d}T00:00:00+00:00",
        )
        # main.pdf revisions 1-2 were created on 2020-01-01 and 2020-01-02.
        for day, revision in enumerate((1, 2), start=1)
    ]
    plot_dumps = [
        _file_dump(
            full_name="plot.png",
            file_version=revision,
            file_type="Plot",
            bibdocid=20,
            creation_date=f"2020-01-{day:02d}T00:00:00+00:00",
        )
        # plot.png revisions 1-4 were created on 2020-01-03 .. 2020-01-06.
        for day, revision in enumerate((1, 2, 3, 4), start=3)
    ]
    entry = {"recid": 123, "files": main_dumps + plot_dumps}

    versions = transform._versions(entry, _record())

    assert list(versions.keys()) == [1, 2]

    files_v1 = versions[1]["files"]
    files_v2 = versions[2]["files"]

    assert set(files_v1) == {"main.pdf"}
    assert set(files_v2) == {"main.pdf"}
    assert files_v1["main.pdf"]["version"] == 1
    assert files_v2["main.pdf"]["version"] == 2
    assert "plot.png" not in files_v1
    assert "plot.png" not in files_v2
Loading