Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "welearn-datastack"
version = "1.4.6"
version = "1.4.7"
description = "Data stack for WeLearn LPI projects. This pipeline can collect, vectorize and store data from various sources."
authors = [
{ name = "Théo Nardin", email = "theo.nardin@learningplanetinstitute.org" },
Expand Down
32 changes: 32 additions & 0 deletions tests/source_models/test_world_bank_okr.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import unittest
from pathlib import Path

from pydantic import ValidationError

from welearn_datastack.data.source_models.world_bank_okr import WorldBankOKRRecord
from welearn_datastack.modules.xml_extractor import XMLExtractor

Expand Down Expand Up @@ -58,3 +60,33 @@ def test_model(self):
model.fileGrp[1].flocat.href,
"https://openknowledge.worldbank.org/bitstreams/e189cde3-ebf4-5360-a248-2ea3e05fa5d6/download",
)

def test_model_without_filegrp(self):
    """Validation must fail once every ``fileGrp`` occurrence is renamed.

    The example document is altered by replacing the substring
    ``fileGrp`` with ``toto`` everywhere, so the model can no longer find
    that tag; a ``ValidationError`` is expected.
    """
    altered_xml = self.example_content.replace("fileGrp", "toto")
    with self.assertRaises(ValidationError):
        # The extractor is built inside the context on purpose, exactly
        # where the assertion is listening for the error.
        WorldBankOKRRecord.model_validate(XMLExtractor(altered_xml))

def test_model_with_empty_filegrp(self):
    """Validation must fail once every ``FLocat`` occurrence is renamed.

    Despite the method name, the ``fileGrp`` tag itself is left intact:
    only the substring ``FLocat`` is replaced with ``toto``. A
    ``ValidationError`` is expected.
    """
    altered_xml = self.example_content.replace("FLocat", "toto")
    with self.assertRaises(ValidationError):
        WorldBankOKRRecord.model_validate(XMLExtractor(altered_xml))

def test_model_with_empty_identifiers(self):
    """Validation must fail once every ``identifier`` occurrence is renamed.

    The substring ``identifier`` is replaced with ``toto`` throughout the
    example document; a ``ValidationError`` is expected.
    """
    altered_xml = self.example_content.replace("identifier", "toto")
    with self.assertRaises(ValidationError):
        WorldBankOKRRecord.model_validate(XMLExtractor(altered_xml))

def test_model_without_title(self):
    """Validation must fail once every ``title`` occurrence is renamed.

    The replacement is a plain substring substitution, so any tag or text
    containing ``title`` in the example document is affected; a
    ``ValidationError`` is expected.
    """
    altered_xml = self.example_content.replace("title", "toto")
    with self.assertRaises(ValidationError):
        WorldBankOKRRecord.model_validate(XMLExtractor(altered_xml))

def test_model_without_abstract(self):
    """Validation must fail once every ``abstract`` occurrence is renamed.

    The substring ``abstract`` is replaced with ``toto`` throughout the
    example document; a ``ValidationError`` is expected.
    """
    altered_xml = self.example_content.replace("abstract", "toto")
    with self.assertRaises(ValidationError):
        WorldBankOKRRecord.model_validate(XMLExtractor(altered_xml))
33 changes: 22 additions & 11 deletions welearn_datastack/data/source_models/world_bank_okr.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,23 @@ class WorldBankOKRRecord(BaseModel):
@classmethod
def _extract_file_grp(cls, value: XMLExtractor) -> list[dict]:
    """Collect the ``file`` entries of the document's first ``fileGrp``.

    Each returned dict holds the attributes of one ``file`` element with
    lower-cased keys, plus a ``"flocat"`` key mapping to the attributes of
    that file's first ``FLocat`` element (keys lower-cased and with the
    ``xlink:`` prefix removed).

    Args:
        value: extractor wrapping the XML document to read.

    Returns:
        One dict per ``file`` element, in document order.

    Raises:
        ValueError: if no ``fileGrp`` element is found, or if a ``file``
            element has no ``FLocat`` element.
    """
    # An empty lookup result is reported as ValueError instead of a bare
    # IndexError (presumably so pydantic surfaces it as a
    # ValidationError -- confirm against the validator decorator).
    try:
        file_grp = value.extract_content(tag="fileGrp")[0].content
    except IndexError as err:
        raise ValueError("There is no fileGrp in this document") from err

    ret = []
    for f in XMLExtractor(file_grp).extract_content(tag="file"):
        f_ret = {k.lower(): v for k, v in f.attributes.items()}
        flocat_xml = XMLExtractor(f.content).extract_content(tag="FLocat")
        # Only the indexing can legitimately fail here, so it is the only
        # statement kept inside the ``try``.
        try:
            flocat = flocat_xml[0]
        except IndexError as err:
            raise ValueError(
                "There is no flocat in this document, so can't find address"
            ) from err
        f_ret["flocat"] = {
            k.lower().replace("xlink:", ""): v
            for k, v in flocat.attributes.items()
        }
        ret.append(f_ret)
    return ret

Expand All @@ -77,9 +85,12 @@ def _extract_dates(cls, value: XMLExtractor) -> dict:

@classmethod
def _extract_identifiers(cls, value: XMLExtractor) -> dict[str, str | None]:
uri = value.extract_content_attribute_filter(
tag="mods:identifier", attribute_name="type", attribute_value="uri"
)[0].content
try:
uri = value.extract_content_attribute_filter(
tag="mods:identifier", attribute_name="type", attribute_value="uri"
)[0].content
except IndexError:
raise ValueError("No URI in this document")
doi_items = value.extract_content_attribute_filter(
tag="mods:identifier", attribute_name="type", attribute_value="doi"
)
Expand All @@ -95,7 +106,7 @@ def support_xml_extractor(cls, value: Any) -> Any:
try:
title = value.extract_content(tag="mods:title")[0].content
except IndexError:
raise NoTitle
raise ValueError("No title in this document")

_authors = [a.content for a in value.extract_content(tag="mods:namePart")]
_subjects = [s.content for s in value.extract_content(tag="mods:topic")]
Expand All @@ -109,7 +120,7 @@ def support_xml_extractor(cls, value: Any) -> Any:
try:
_abstract = value.extract_content(tag="mods:abstract")[0].content
except IndexError:
raise NoDescriptionFoundError
raise ValueError("No abstract in this document")

ret = {
"authors": _authors,
Expand Down
Loading