class BulletinDraftsModel(CdsOverdo):
    """Translation model for Bulletin Drafts.

    Holds the declarative configuration for the bulletin drafts stream: the
    legacy search query, the MARC keys listed as ignorable and the default
    field values. How these attributes are consumed is defined by
    ``CdsOverdo`` and is not visible here.
    """

    # Matches records whose 980 tag carries any one of the six
    # BULLETIN*DRAFT collection values (the alternatives are OR-ed together).
    __query__ = """(
        980__:"BULLETINSTAFFDRAFT" OR
        980__:"BULLETINNEWSDRAFT" OR
        980__:"BULLETINOFFICIALDRAFT" OR
        980__:"BULLETINTRAININGDRAFT" OR
        980__:"BULLETINANNOUNCEDRAFT" OR
        980__:"BULLETINEVENTSDRAFT"
    )
    """

    # Copy-pasted from bulletin issue: the list is duplicated here as a
    # literal rather than shared with that model.
    # Key format is the MARC tag, two indicator positions ("_" when blank)
    # and the subfield code.
    # NOTE(review): presumably these keys are dropped without being reported
    # as missing rules - confirm against CdsOverdo.
    __ignore_keys__ = {
        "0248_a",
        "0248_p",
        "0248_q",
        "100__m",  # email of contributor
        "110__a",  # corporate author, always CERN, safe to ignore
        "300__a",  # number of pages
        "336__a",  # DM metadata
        "5831_2",  # DM tags 1054836
        "5831_5",  # DM tags
        "5831_a",  # DM tags
        "5831_c",  # DM tags
        "5831_f",  # DM tags
        "5831_i",  # DM tags
        "5831_k",  # DM tags
        "5831_u",  # DM tags
        "5831_3",  # DM tags
        "5831_6",  # DM tags
        "5831_n",  # DM tags
        "5831_b",  # DM tags
        "5831_o",  # DM tags
        "583__a",  # DM tags
        "583__c",  # DM tags
        "583__z",  # DM tags
        "594__a",  # values: "no", "pub"
        "650172",  # scheme of subjects
        "6531_9",  # scheme of keywords
        "691__a",  # draft/online values, redundant
        "700__m",  # email of contributor
        "773__p",  # title of the "CERN Bulletin" series
        "773__t",  # CERN Bulletin value, redundant
        "773__y",  # year, duplicate of 260
        "8560_f",  # contact email
        "8564_8",  # file id
        "8564_s",  # bibdoc id
        "8564_x",  # icon thumbnails sizes
        "8564_y",  # file description - done by files dump
        "8564_2",  # DM metadata
        "8564_q",  # DM metadata
        "8564_w",  # DM metadata
        "8564_z",  # DM metadata
        "8567_2",  # DM tags
        "8567_q",  # DM tags
        "8567_w",  # DM tags
        "8567_d",  # DM tags
        "906__m",  # edit rights, will be granted by the community
        "937__c",  # last modified by
        "937__s",  # last modification date
        "960__a",  # base number
        "961__a",  # Curation Auditing tag
        "961__b",  # Curation Auditing tag
        "961__c",  # Curation Auditing tag
        "961__h",  # Curation Auditing tag
        "961__l",  # Curation Auditing tag
        "961__x",  # Curation Auditing tag
        "981__a",  # duplicate record id
        # "246_1a",
        # "690C_a",
    }

    # Default values: the journal title "CERN Bulletin" and CERN as an
    # organizational creator.
    # NOTE(review): presumably applied to every record of this model, and
    # possibly overridden by rule output - confirm against CdsOverdo.
    _default_fields = {
        "custom_fields": {"journal:journal": {"title": "CERN Bulletin"}},
        "creators": [{"person_or_org": {"type": "organizational", "name": "CERN"}}],
    }


# Module-level model instance. Its rules come from the
# "cds_migrator_kit.migrator.rules.bulletin_drafts" entry point group; the
# staff association and bulletin issue models are passed as bases.
# NOTE(review): presumably so that their rules are inherited - confirm.
bulletin_drafts_model = BulletinDraftsModel(
    bases=(
        staff_association_model,
        bull_issue_model,
    ),
    entry_point_group="cds_migrator_kit.migrator.rules.bulletin_drafts",
)
class StaffAssociationModel(CdsOverdo):
    """Translation model for Staff Association.

    Holds the declarative configuration for the staff association stream: the
    legacy search query, the MARC keys listed as ignorable and the default
    field values. How these attributes are consumed is defined by
    ``CdsOverdo`` and is not visible here.
    """

    # Two alternatives joined by OR:
    #   1. 980 tag BULLETINSTAFF, with the CERN_BULLETIN_ARTICLE and
    #      CERN_BULLETIN_ISSUE collections excluded by the leading "-";
    #   2. 980 tag STAFFASSOCIATION together with 594 value PUB.
    # NOTE(review): assumes terms listed without an operator are AND-ed by
    # the legacy search syntax - confirm.
    __query__ = """
    (
        980__:"BULLETINSTAFF"
        -980__:CERN_BULLETIN_ARTICLE
        -980__:CERN_BULLETIN_ISSUE
    )
    OR
    (
        980__:STAFFASSOCIATION
        594__:PUB
    )
    """

    # Copy-pasted from bulletin issue: the list is duplicated here as a
    # literal rather than shared with that model.
    # Key format is the MARC tag, two indicator positions ("_" when blank)
    # and the subfield code.
    # NOTE(review): presumably these keys are dropped without being reported
    # as missing rules - confirm against CdsOverdo.
    __ignore_keys__ = {
        "0248_a",
        "0248_p",
        "0248_q",
        "100__m",  # email of contributor
        "110__a",  # corporate author, always CERN, safe to ignore
        "300__a",  # number of pages
        "336__a",  # DM metadata
        "5831_2",  # DM tags 1054836
        "5831_5",  # DM tags
        "5831_a",  # DM tags
        "5831_c",  # DM tags
        "5831_f",  # DM tags
        "5831_i",  # DM tags
        "5831_k",  # DM tags
        "5831_u",  # DM tags
        "5831_3",  # DM tags
        "5831_6",  # DM tags
        "5831_n",  # DM tags
        "5831_b",  # DM tags
        "5831_o",  # DM tags
        "583__a",  # DM tags
        "583__c",  # DM tags
        "583__z",  # DM tags
        "594__a",  # values: "no", "pub"
        "650172",  # scheme of subjects
        "6531_9",  # scheme of keywords
        "691__a",  # draft/online values, redundant
        "700__m",  # email of contributor
        "773__p",  # title of the "CERN Bulletin" series
        "773__t",  # CERN Bulletin value, redundant
        "773__y",  # year, duplicate of 260
        "8560_f",  # contact email
        "8564_8",  # file id
        "8564_s",  # bibdoc id
        "8564_x",  # icon thumbnails sizes
        "8564_y",  # file description - done by files dump
        "8564_2",  # DM metadata
        "8564_q",  # DM metadata
        "8564_w",  # DM metadata
        "8564_z",  # DM metadata
        "8567_2",  # DM tags
        "8567_q",  # DM tags
        "8567_w",  # DM tags
        "8567_d",  # DM tags
        "906__m",  # edit rights, will be granted by the community
        "937__c",  # last modified by
        "937__s",  # last modification date
        "960__a",  # base number
        "961__a",  # Curation Auditing tag
        "961__b",  # Curation Auditing tag
        "961__c",  # Curation Auditing tag
        "961__h",  # Curation Auditing tag
        "961__l",  # Curation Auditing tag
        "961__x",  # Curation Auditing tag
        "981__a",  # duplicate record id
        # "246_1a",
        # "690C_a",
    }

    # Default values: the journal title "CERN Bulletin" and CERN as an
    # organizational creator.
    # NOTE(review): presumably applied to every record of this model, and
    # possibly overridden by rule output - confirm against CdsOverdo.
    _default_fields = {
        "custom_fields": {"journal:journal": {"title": "CERN Bulletin"}},
        "creators": [{"person_or_org": {"type": "organizational", "name": "CERN"}}],
    }


# Module-level model instance. Its rules come from the
# "cds_migrator_kit.migrator.rules.staff_association" entry point group; the
# bulletin issue model is passed as the only base.
# NOTE(review): presumably so that its rules are inherited - confirm.
staff_association_model = StaffAssociationModel(
    bases=(bull_issue_model,),
    entry_point_group="cds_migrator_kit.migrator.rules.staff_association",
)
@model.over("resource_type", "^980__", override=True)
def resource_type(self, key, value):
    """Map a bulletin draft 980 collection tag to its resource type.

    The ``a`` subfield is read (an absent subfield counts as an empty
    string) and lower-cased. Each of the six known draft collections maps to
    the periodical article resource type.

    Raises:
        UnexpectedValue: when the lower-cased subfield is not one of the
            known draft collections.
    """
    draft_collections = (
        "bulletinstaffdraft",
        "bulletinnewsdraft",
        "bulletinofficialdraft",
        "bulletintrainingdraft",
        "bulletinannouncedraft",
        "bulletineventsdraft",
    )
    collection_tag = value.get("a", "").lower()

    # Guard clause: reject anything outside the known draft collections.
    if collection_tag not in draft_collections:
        raise UnexpectedValue(
            "Unknown resource type (BULLETIN DRAFTS)",
            field=key,
            value=collection_tag,
        )
    return {"id": "publication-periodicalarticle"}
# Register rules defined in other modules on this model as they are.
model.over("internal_notes", "^562__")(internal_notes)
model.over("additional_titles", "(^242__)")(additional_titles)


@model.over("resource_type", "^980__", override=True)
def resource_type(self, key, value):
    """Map a staff association 980 collection tag to its resource type.

    Subfield ``a`` is used when the key is present, otherwise subfield ``b``.
    The value is lower-cased when it is truthy.

    Returns:
        The periodical article resource type for ``bulletinstaff`` and
        ``staffassociation``.

    Raises:
        UnexpectedValue: for any other value, including a missing or empty
            subfield.
    """
    value = value.get("a") if "a" in value else value.get("b")
    if value:
        value = value.lower()
    if value in ["bulletinstaff", "staffassociation"]:
        return {"id": "publication-periodicalarticle"}
    raise UnexpectedValue(
        "Unknown resource type (STAFF ASSOCIATION)", field=key, value=value
    )


@model.over("collection", "^690C_", override=True)
@for_each_value
def staff_association_collection(self, key, value):
    """Translate the 690C collection field for staff association records.

    Values whose ``a`` subfield is "sa documents" (after stripping and
    lower-casing) are dropped. Every other value is handed to the bulletin
    issue ``collection`` rule.

    Raises:
        IgnoreKey: for "sa documents" values.
    """
    collection_a = value.get("a", "").strip().lower()
    # Drop sa documents
    if collection_a == "sa documents":
        raise IgnoreKey("collection")
    # NOTE(review): the delegate's return value is discarded, so this rule
    # yields None unless the delegate raises - confirm that is intended.
    collection(self, key, value)


# Known 859__a values that are staff association / bulletin names or typos.
# Matching is exact: case, inner spaces and leading spaces are significant.
_IGNORED_STAFF_ASSOCIATION_SUBMITTERS = {
    "",
    " Staff.Bulletin@cern.ch",
    "Association du personnel",
    "Mutual Aid Fund",
    "STAFF ASSOCIATION",
    "Saff.Bulletin@cern.ch",
    "Satff.Bulletin@cern.ch",
    "Satff.bulletin@cern.ch",
    "Staff Association",
    "Staff. Bulletin@cern.ch",
    "Staff. bulletin@cern.ch",
    "Staff.Asscociation@cern.ch",
    "Staff.Association@cern.ch",
    "Staff.Bulletin-editors@cern.ch",
    "Staff.Bulletin@cern.ch",
    "Staff.Kindergarten@cern.ch",
    "Staff.association@cern.ch",
    "Staff.bulletin@cern.ch",
    "Staff.bulletins@cern.ch",
    "Staff:Bulletin@cern.ch",
    "bulletin-editors@cern.ch",
    "cern.bulletin@cern.ch",
    "staff-bulletin@cern.ch",
    "staff.asociation@cern.ch",
    "staff.association",
    "staff.association@cern.ch",
    "staff.bullelin@cern.ch",
    "staff.bulletin@Cern.ch",
    "staff.bulletin@cern.",
    "staff.bulletin@cern.ch",
    "staff.bulletin@ern.ch",
    "staff.bulletins@cern.ch",
    "staff.buttetin@cern.ch",
    "statt.bulletin@cern.ch",
    "stff.bulletin@cern.ch",
}


@model.over("submitter", "(^859__)", override=True)
def staff_contact_person(self, key, value):
    """Record the 859 contact person, then delegate to the base submitter rule.

    When subfield ``a`` is non-empty and is not one of the known generic
    staff association / bulletin addresses, a "Contact: ..." entry of type
    ``other`` is appended to ``additional_descriptions`` on the record.

    Returns:
        Whatever the base ``record_submitter`` rule returns for this field.
    """
    contact_person = value.get("a", "")
    if contact_person and contact_person not in _IGNORED_STAFF_ASSOCIATION_SUBMITTERS:
        # Create the list only when it is absent, so descriptions added by
        # earlier rules are kept.
        descriptions = self.setdefault("additional_descriptions", [])
        descriptions.append(
            {
                # The line breaks are written as escapes: a single-quoted
                # f-string cannot span physical lines.
                # NOTE(review): these breaks may stand in for markup lost
                # before this text was captured - confirm the intended
                # description text.
                "description": f"\n\nContact: {contact_person}\n\n",
                "type": {
                    "id": "other",
                },
            }
        )
    return base_submitter(self, key, value)
cds_migrator_kit.migrator.rdm.rules.base = @@ -168,6 +169,14 @@ cds_migrator_kit.migrator.rules.fap = base = cds_migrator_kit.transform.xml_processing.rules.base base_records = cds_migrator_kit.rdm.records.transform.xml_processing.rules.base fap = cds_migrator_kit.rdm.records.transform.xml_processing.rules.fap +cds_migrator_kit.migrator.rules.bulletin_drafts = + base = cds_migrator_kit.transform.xml_processing.rules.base + base_records = cds_migrator_kit.rdm.records.transform.xml_processing.rules.base + bulletin_drafts = cds_migrator_kit.rdm.records.transform.xml_processing.rules.bulletin_drafts +cds_migrator_kit.migrator.rules.staff_association = + base = cds_migrator_kit.transform.xml_processing.rules.base + base_records = cds_migrator_kit.rdm.records.transform.xml_processing.rules.base + staff_association = cds_migrator_kit.rdm.records.transform.xml_processing.rules.staff_association cds_migrator_kit.migrator.rules.people = people = cds_migrator_kit.rdm.users.transform.xml_processing.rules.people invenio_pidstore.minters =