CyberCRI · lpi-tn · Jun 11, 2026 · Jun 12, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "welearn-datastack"
-version = "1.4.6"
+version = "1.4.7"
 description = "Data stack for WeLearn LPI projects. This pipeline can collect, vectorize and store data from various sources."
 authors = [
     { name = "Théo Nardin", email = "theo.nardin@learningplanetinstitute.org" },

diff --git a/tests/source_models/test_world_bank_okr.py b/tests/source_models/test_world_bank_okr.py
@@ -1,6 +1,8 @@
 import unittest
 from pathlib import Path
 
+from pydantic import ValidationError
+
 from welearn_datastack.data.source_models.world_bank_okr import WorldBankOKRRecord
 from welearn_datastack.modules.xml_extractor import XMLExtractor
 
@@ -58,3 +60,33 @@ def test_model(self):
             model.fileGrp[1].flocat.href,
             "https://openknowledge.worldbank.org/bitstreams/e189cde3-ebf4-5360-a248-2ea3e05fa5d6/download",
         )
+
+    def test_model_without_filegrp(self):
+        # Renaming every "fileGrp" occurrence leaves no file group to extract.
+        broken_xml = self.example_content.replace("fileGrp", "toto")
+        with self.assertRaises(ValidationError):
+            WorldBankOKRRecord.model_validate(XMLExtractor(broken_xml))
+
+    def test_model_with_empty_filegrp(self):
+        # Renaming "FLocat" keeps the file group but hides every FLocat tag.
+        broken_xml = self.example_content.replace("FLocat", "toto")
+        with self.assertRaises(ValidationError):
+            WorldBankOKRRecord.model_validate(XMLExtractor(broken_xml))
+
+    def test_model_with_empty_identifiers(self):
+        # Renaming "identifier" hides the mods:identifier tags, so no URI is found.
+        broken_xml = self.example_content.replace("identifier", "toto")
+        with self.assertRaises(ValidationError):
+            WorldBankOKRRecord.model_validate(XMLExtractor(broken_xml))
+
+    def test_model_without_title(self):
+        # Renaming "title" hides the mods:title tag the model requires.
+        broken_xml = self.example_content.replace("title", "toto")
+        with self.assertRaises(ValidationError):
+            WorldBankOKRRecord.model_validate(XMLExtractor(broken_xml))
+
+    def test_model_without_abstract(self):
+        # Renaming "abstract" hides the mods:abstract tag the model requires.
+        broken_xml = self.example_content.replace("abstract", "toto")
+        with self.assertRaises(ValidationError):
+            WorldBankOKRRecord.model_validate(XMLExtractor(broken_xml))
diff --git a/welearn_datastack/data/source_models/world_bank_okr.py b/welearn_datastack/data/source_models/world_bank_okr.py
@@ -48,15 +48,23 @@ class WorldBankOKRRecord(BaseModel):
     @classmethod
     def _extract_file_grp(cls, value: XMLExtractor) -> list[dict]:
         ret = []
-        file_grp = value.extract_content(tag="fileGrp")[0].content
+        try:
+            file_grp = value.extract_content(tag="fileGrp")[0].content
+        except IndexError as exc:
+            raise ValueError("There is no fileGrp in this document") from exc
         for f in XMLExtractor(file_grp).extract_content(tag="file"):
             f_ret = {k.lower(): v for k, v in f.attributes.items()}
             flocat_xml = XMLExtractor(f.content).extract_content(tag="FLocat")
-            flocat_ret = {
-                k.lower().replace("xlink:", ""): v
-                for k, v in flocat_xml[0].attributes.items()
-            }
-            f_ret["flocat"] = flocat_ret
+            try:
+                flocat = flocat_xml[0]
+            except IndexError as exc:
+                # Guard only the lookup so errors from building the dict surface as-is.
+                raise ValueError(
+                    "There is no flocat in this document, so can't find address"
+                ) from exc
+            f_ret["flocat"] = {
+                k.lower().replace("xlink:", ""): v for k, v in flocat.attributes.items()
+            }
             ret.append(f_ret)
         return ret
 
@@ -77,9 +85,12 @@ def _extract_dates(cls, value: XMLExtractor) -> dict:
 
     @classmethod
     def _extract_identifiers(cls, value: XMLExtractor) -> dict[str, str | None]:
-        uri = value.extract_content_attribute_filter(
-            tag="mods:identifier", attribute_name="type", attribute_value="uri"
-        )[0].content
+        try:
+            uri = value.extract_content_attribute_filter(
+                tag="mods:identifier", attribute_name="type", attribute_value="uri"
+            )[0].content
+        except IndexError:
+            raise ValueError("No URI in this document")
         doi_items = value.extract_content_attribute_filter(
             tag="mods:identifier", attribute_name="type", attribute_value="doi"
         )
@@ -95,7 +106,7 @@ def support_xml_extractor(cls, value: Any) -> Any:
             try:
                 title = value.extract_content(tag="mods:title")[0].content
             except IndexError:
-                raise NoTitle
+                raise ValueError("No title in this document")
 
             _authors = [a.content for a in value.extract_content(tag="mods:namePart")]
             _subjects = [s.content for s in value.extract_content(tag="mods:topic")]
@@ -109,7 +120,7 @@ def support_xml_extractor(cls, value: Any) -> Any:
             try:
                 _abstract = value.extract_content(tag="mods:abstract")[0].content
             except IndexError:
-                raise NoDescriptionFoundError
+                raise ValueError("No abstract in this document")
 
             ret = {
                 "authors": _authors,