Skip to content

Commit 35c2e12

Browse files
authored
Merge pull request #377 from MLMI2-CSSI/metadata_error_handling
Metadata error handling
2 parents 5a7dfbb + 5564660 commit 35c2e12

3 files changed

Lines changed: 149 additions & 15 deletions

File tree

foundry/foundry.py

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from json2table import convert
55
import numpy as np
66
import pandas as pd
7+
from pydantic import ValidationError
78
from typing import Any, Dict, List
89
import logging
910
import warnings
@@ -23,15 +24,16 @@
2324
from foundry.models import (
2425
FoundryMetadata,
2526
FoundryConfig,
26-
FoundryDataset
27+
FoundryDataset,
28+
FoundryBase
2729
)
2830
from foundry.https_download import download_file, recursive_ls
2931
from foundry.https_upload import upload_to_endpoint
3032

3133
logger = logging.getLogger(__name__)
3234

3335

34-
class Foundry(FoundryMetadata):
36+
class Foundry(FoundryBase):
3537
"""Foundry Client Base Class
3638
TODO:
3739
-------
@@ -49,7 +51,8 @@ class Foundry(FoundryMetadata):
4951

5052
def __init__(
5153
self, name=None, no_browser=False, no_local_server=False, index="mdf", authorizers=None,
52-
download=True, globus=True, verbose=False, metadata=None, interval=10, **data
54+
download=True, globus=True, verbose=False, metadata=None, interval=10,
55+
**data
5356
):
5457
"""Initialize a Foundry client
5558
Args:
@@ -75,6 +78,13 @@ def __init__(
7578
self.index = index
7679
self.auths = None
7780

81+
self.config = FoundryConfig(
82+
dataframe_file="foundry_dataframe.json",
83+
metadata_file="foundry_metadata.json",
84+
local=False,
85+
local_cache_dir="./data",
86+
)
87+
7888
if authorizers:
7989
self.auths = authorizers
8090
else:
@@ -392,6 +402,9 @@ def publish_dataset(
392402
of dataset. Contains `source_id`, which can be used to check the
393403
status of the submission
394404
"""
405+
# ensure metadata is properly formatted
406+
self.validate_metadata(foundry_metadata)
407+
395408
# ensure that one of `https_data_path` or `globus_data_source` have been assigned values
396409
if (https_data_path and globus_data_source) or \
397410
(https_data_path is None and globus_data_source is None):
@@ -774,3 +787,27 @@ def to_tensorflow(self, split: str = None):
774787

775788
inputs, targets = self._get_inputs_targets(split)
776789
return TensorflowSequence(inputs, targets)
790+
791+
def validate_metadata(self, metadata):
792+
"""Validate the JSON message against the FoundryMetadata model
793+
794+
Arguments:
795+
metadata (dict): Metadata information provided by the user.
796+
797+
Raises:
798+
ValidationError: if metadata supplied by user does not meet the specification of a
799+
FoundryMetadata object.
800+
801+
"""
802+
try:
803+
FoundryMetadata(**metadata)
804+
logger.debug("Metadata validation successful!")
805+
except ValidationError as e:
806+
logger.error("Metadata validation failed!")
807+
for error in e.errors():
808+
field_name = ".".join([item for item in error['loc'] if isinstance(item, str)])
809+
error_description = error['msg']
810+
error_message = f"""There is an issue validating the metadata for the field '{field_name}':
811+
The error message returned is: '{error_description}'."""
812+
logger.error(error_message)
813+
raise e

foundry/models.py

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from typing import List, Dict, Optional, Any
2-
from pydantic import BaseModel
2+
from pydantic import BaseModel, Field, StrictInt, StrictStr
33
from enum import Enum
44
import pandas as pd
55
from json2table import convert
@@ -58,6 +58,7 @@ def _repr_html_(self):
5858

5959
# END Classes for Foundry Data Package Specification
6060

61+
6162
class FoundryDatasetType(Enum):
6263
"""Foundry Dataset Types
6364
Enumeration of the possible Foundry dataset types
@@ -70,23 +71,40 @@ class FoundryDatasetType(Enum):
7071

7172

7273
class FoundryKeyClass(BaseModel):
73-
label: str = ""
74-
name: str = ""
74+
label: StrictStr = Field(..., description="The label that exists in the data")
75+
name: StrictStr = Field(..., description="The name the label maps onto.")
7576

7677

7778
class FoundryKey(BaseModel):
78-
key: List[str] = []
79-
type: str = ""
80-
filter: Optional[str] = ""
81-
units: Optional[str] = ""
82-
description: Optional[str] = ""
79+
key: List[StrictStr] = Field(..., description="Column or header name for tabular data, key/path for HDF5 data")
80+
type: StrictStr = Field(..., description="Whether input or target")
8381
classes: Optional[List[FoundryKeyClass]]
82+
description: Optional[StrictStr]
83+
filter: Optional[StrictStr]
84+
units: Optional[StrictStr]
8485

8586

8687
class FoundrySplit(BaseModel):
87-
type: str = ""
88-
path: Optional[str] = ""
89-
label: Optional[str] = ""
88+
type: StrictStr = Field(..., description="The kind of partition of the dataset (train, test, validation, etc)")
89+
path: Optional[StrictStr]
90+
label: Optional[StrictStr]
91+
92+
93+
class FoundryMetadata(BaseModel):
94+
"""Foundry Dataset
95+
Schema for Foundry Datasets. This includes specifications of inputs, outputs, type, version, and more
96+
"""
97+
data_type: FoundryDatasetType = Field(..., description="The kind of data in the dataset, e.g. tabular, json, hdf5")
98+
domain: List[StrictStr] = Field(..., description="The domain of applicability. e.g., materials science, chemistry, machine vision")
99+
keys: List[FoundryKey] = Field(..., description="Keys describing how to load the data")
100+
dataframe: Optional[Any]
101+
n_items: Optional[StrictInt]
102+
short_name: Optional[StrictStr]
103+
splits: Optional[List[FoundrySplit]]
104+
task_type: Optional[List[StrictStr]]
105+
106+
class Config:
107+
arbitrary_types_allowed = True
90108

91109

92110
class FoundryDataset(BaseModel):
@@ -134,7 +152,7 @@ def _repr_html_(self):
134152
return convert(json.loads(self.json()))
135153

136154

137-
class FoundryMetadata(BaseModel):
155+
class FoundryBase(BaseModel):
138156
dc: Optional[Dict] = {} # pydantic Datacite?
139157
mdf: Optional[Dict] = {}
140158
dataset: FoundryDataset = {}

tests/test_foundry.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from datetime import datetime
77
from math import floor
88
import numpy as np
9+
from pydantic import ValidationError
910
import requests
1011
import mdf_toolbox
1112
import pandas as pd
@@ -134,6 +135,66 @@
134135
'n_items': 1000
135136
}
136137

138+
139+
pub_test_invalid_metadata = {
140+
"keys": [
141+
{
142+
"key": ["sepal length (cm)"],
143+
"type": "input",
144+
"units": "cm",
145+
"description": 10
146+
},
147+
{
148+
"key": ["sepal width (cm)"],
149+
"type": "input",
150+
"units": "cm",
151+
"description": "sepal width in unit(cm)"
152+
},
153+
{
154+
"key": ["petal length (cm)"],
155+
"type": "input",
156+
"units": "cm",
157+
"description": "petal length in unit(cm)"
158+
},
159+
{
160+
"key": ["petal width (cm)"],
161+
"type": "input",
162+
"units": "cm",
163+
"description": "petal width in unit(cm)"
164+
},
165+
{
166+
"key": ["y"],
167+
"type": "output",
168+
"units": "",
169+
"description": "flower type",
170+
"classes": [
171+
{
172+
"label": "0",
173+
"name": "setosa"
174+
},
175+
{
176+
"label": "1",
177+
"name": "versicolor"
178+
},
179+
{
180+
"label": "2",
181+
"name": "virginica"
182+
}
183+
]
184+
}
185+
],
186+
'splits': [
187+
{'label': 'train', 'path': 'train.json', 'type': 'train'},
188+
{'label': 'test', 'path': 'test.json', 'type': 'test'}
189+
],
190+
"short_name": "example_AS_iris_test_{:.0f}".format(datetime.now().timestamp()),
191+
"data_type": "tabular",
192+
'task_type': ['unsupervised', 'generative'],
193+
'domain': ['materials science', 'chemistry'],
194+
'n_items': 1000
195+
}
196+
197+
137198
# Globus endpoint for '_iris_dev' for test publication
138199
pub_test_data_source = "https://app.globus.org/file-manager?origin_id=e38ee745-6d04-11e5-ba46-22000b92c6ec&origin_path=%2Ffoundry-test%2Firis-dev%2F"
139200

@@ -300,6 +361,24 @@ def test_publish_with_https():
300361
assert res['source_id'] == f"_test_{short_name}_v1.1"
301362

302363

364+
def test_publish_invalid_metadata():
365+
"""Testing the validation of the metadata when publishing data
366+
"""
367+
with pytest.raises(ValidationError) as exc_info:
368+
f = Foundry(index="mdf-test", authorizers=auths)
369+
timestamp = datetime.now().timestamp()
370+
title = "https_publish_test_{:.0f}".format(timestamp)
371+
short_name = "https_pub_{:.0f}".format(timestamp)
372+
authors = ["A Scourtas"]
373+
local_path = "./data/https_test"
374+
375+
# create test JSON to upload (if it doesn't already exist)
376+
_write_test_data(local_path)
377+
f.publish_dataset(pub_test_invalid_metadata, title, authors, https_data_path=local_path, short_name=short_name)
378+
379+
assert exc_info.value.errors()[0]['msg'] == 'str type expected'
380+
381+
303382
def test_upload_to_endpoint():
304383
"""Unit test: Test the _upload_to_endpoint() HTTPS functionality on its own, without publishing to MDF
305384
"""

0 commit comments

Comments
 (0)