diff --git a/pyproject.toml b/pyproject.toml index c2a8bbf..774f996 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "welearn-datastack" -version = "1.4.6" +version = "1.4.7" description = "Data stack for WeLearn LPI projects. This pipeline can collect, vectorize and store data from various sources." authors = [ { name = "Théo Nardin", email = "theo.nardin@learningplanetinstitute.org" }, diff --git a/tests/source_models/test_world_bank_okr.py b/tests/source_models/test_world_bank_okr.py index 696f113..e941b53 100644 --- a/tests/source_models/test_world_bank_okr.py +++ b/tests/source_models/test_world_bank_okr.py @@ -1,6 +1,8 @@ import unittest from pathlib import Path +from pydantic import ValidationError + from welearn_datastack.data.source_models.world_bank_okr import WorldBankOKRRecord from welearn_datastack.modules.xml_extractor import XMLExtractor @@ -58,3 +60,33 @@ def test_model(self): model.fileGrp[1].flocat.href, "https://openknowledge.worldbank.org/bitstreams/e189cde3-ebf4-5360-a248-2ea3e05fa5d6/download", ) + + def test_model_without_filegrp(self): + with self.assertRaises(ValidationError): + WorldBankOKRRecord.model_validate( + XMLExtractor(self.example_content.replace("fileGrp", "toto")) + ) + + def test_model_with_empty_filegrp(self): + with self.assertRaises(ValidationError): + WorldBankOKRRecord.model_validate( + XMLExtractor(self.example_content.replace("FLocat", "toto")) + ) + + def test_model_with_empty_identifiers(self): + with self.assertRaises(ValidationError): + WorldBankOKRRecord.model_validate( + XMLExtractor(self.example_content.replace("identifier", "toto")) + ) + + def test_model_without_title(self): + with self.assertRaises(ValidationError): + WorldBankOKRRecord.model_validate( + XMLExtractor(self.example_content.replace("title", "toto")) + ) + + def test_model_without_abstract(self): + with self.assertRaises(ValidationError): + WorldBankOKRRecord.model_validate( + XMLExtractor(self.example_content.replace("abstract", "toto")) + ) diff --git a/welearn_datastack/data/source_models/world_bank_okr.py b/welearn_datastack/data/source_models/world_bank_okr.py index c20d48e..0d9ae6b 100644 --- a/welearn_datastack/data/source_models/world_bank_okr.py +++ b/welearn_datastack/data/source_models/world_bank_okr.py @@ -48,15 +48,23 @@ class WorldBankOKRRecord(BaseModel): @classmethod def _extract_file_grp(cls, value: XMLExtractor) -> list[dict]: ret = [] - file_grp = value.extract_content(tag="fileGrp")[0].content + try: + file_grp = value.extract_content(tag="fileGrp")[0].content + except IndexError: + raise ValueError("There is no fileGrp in this document") for f in XMLExtractor(file_grp).extract_content(tag="file"): f_ret = {k.lower(): v for k, v in f.attributes.items()} flocat_xml = XMLExtractor(f.content).extract_content(tag="FLocat") - flocat_ret = { - k.lower().replace("xlink:", ""): v - for k, v in flocat_xml[0].attributes.items() - } - f_ret["flocat"] = flocat_ret + try: + flocat_ret = { + k.lower().replace("xlink:", ""): v + for k, v in flocat_xml[0].attributes.items() + } + f_ret["flocat"] = flocat_ret + except IndexError: + raise ValueError( + "There is no flocat in this document, so can't find address" + ) ret.append(f_ret) return ret @@ -77,9 +85,12 @@ def _extract_dates(cls, value: XMLExtractor) -> dict: @classmethod def _extract_identifiers(cls, value: XMLExtractor) -> dict[str, str | None]: - uri = value.extract_content_attribute_filter( - tag="mods:identifier", attribute_name="type", attribute_value="uri" - )[0].content + try: + uri = value.extract_content_attribute_filter( + tag="mods:identifier", attribute_name="type", attribute_value="uri" + )[0].content + except IndexError: + raise ValueError("No URI in this document") doi_items = value.extract_content_attribute_filter( tag="mods:identifier", attribute_name="type", attribute_value="doi" ) @@ -95,7 +106,7 @@ def support_xml_extractor(cls, value: Any) -> Any: try: title = value.extract_content(tag="mods:title")[0].content except IndexError: - raise NoTitle + raise ValueError("No title in this document") _authors = [a.content for a in value.extract_content(tag="mods:namePart")] _subjects = [s.content for s in value.extract_content(tag="mods:topic")] @@ -109,7 +120,7 @@ def support_xml_extractor(cls, value: Any) -> Any: try: _abstract = value.extract_content(tag="mods:abstract")[0].content except IndexError: - raise NoDescriptionFoundError + raise ValueError("No abstract in this document") ret = { "authors": _authors,